diff --git a/.buildinfo b/.buildinfo index 85e01e148..a975dc02d 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: e54d1b4e788f1dd051bb72ae18aa701c +config: 951ec2f7d15cc471c5801ca0c58642a5 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_c_make_c_compiler_id_8c.html b/_c_make_c_compiler_id_8c.html new file mode 100644 index 000000000..912a6d567 --- /dev/null +++ b/_c_make_c_compiler_id_8c.html @@ -0,0 +1,350 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles/3.28.1/CompilerIdC/CMakeCCompilerId.c File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
CMakeCCompilerId.c File Reference
+
+
+

Macro Definition Documentation

+ +

◆ __has_include

+ +
+
+ + + + + + + +
#define __has_include( x)   0
+
+ +
+
+ +

◆ ARCHITECTURE_ID

+ +
+
+ + + + +
#define ARCHITECTURE_ID
+
+ +
+
+ +

◆ C_VERSION

+ +
+
+ + + + +
#define C_VERSION
+
+ +
+
+ +

◆ COMPILER_ID

+ +
+
+ + + + +
#define COMPILER_ID   ""
+
+ +
+
+ +

◆ DEC

+ +
+
+ + + + + + + +
#define DEC( n)
+
+Value:
('0' + (((n) / 10000000)%10)), \
+
('0' + (((n) / 1000000)%10)), \
+
('0' + (((n) / 100000)%10)), \
+
('0' + (((n) / 10000)%10)), \
+
('0' + (((n) / 1000)%10)), \
+
('0' + (((n) / 100)%10)), \
+
('0' + (((n) / 10)%10)), \
+
('0' + ((n) % 10))
+
+
+
+ +

◆ HEX

+ +
+
+ + + + + + + +
#define HEX( n)
+
+Value:
('0' + ((n)>>28 & 0xF)), \
+
('0' + ((n)>>24 & 0xF)), \
+
('0' + ((n)>>20 & 0xF)), \
+
('0' + ((n)>>16 & 0xF)), \
+
('0' + ((n)>>12 & 0xF)), \
+
('0' + ((n)>>8 & 0xF)), \
+
('0' + ((n)>>4 & 0xF)), \
+
('0' + ((n) & 0xF))
+
+
+
+ +

◆ PLATFORM_ID

+ +
+
+ + + + +
#define PLATFORM_ID
+
+ +
+
+ +

◆ STRINGIFY

+ +
+
+ + + + + + + +
#define STRINGIFY( X)   STRINGIFY_HELPER(X)
+
+ +
+
+ +

◆ STRINGIFY_HELPER

+ +
+
+ + + + + + + +
#define STRINGIFY_HELPER( X)   #X
+
+ +
+
+

Function Documentation

+ +

◆ main()

+ +
+
+ + + + + + + + + + + +
int main (int argc,
char * argv[] )
+
+ +
+
+

Variable Documentation

+ +

◆ info_arch

+ +
+
+ + + + +
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]"
+
+ +
+
+ +

◆ info_compiler

+ +
+
+ + + + +
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]"
+
+ +
+
+ +

◆ info_language_extensions_default

+ +
+
+ + + + +
const char* info_language_extensions_default
+
+Initial value:
= "INFO" ":" "extensions_default["
+
+
+
+
+
+
"OFF"
+
+
"]"
+
+
+
+ +

◆ info_language_standard_default

+ +
+
+ + + + +
const char* info_language_standard_default
+
+Initial value:
=
+
"INFO" ":" "standard_default[" C_VERSION "]"
+
#define C_VERSION
Definition CMakeCCompilerId.c:819
+
+
+
+ +

◆ info_platform

+ +
+
+ + + + +
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]"
+
+ +
+
+
+ + + + diff --git a/_c_make_c_x_x_compiler_id_8cpp.html b/_c_make_c_x_x_compiler_id_8cpp.html new file mode 100644 index 000000000..283623ae0 --- /dev/null +++ b/_c_make_c_x_x_compiler_id_8cpp.html @@ -0,0 +1,362 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles/3.28.1/CompilerIdCXX/CMakeCXXCompilerId.cpp File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
CMakeCXXCompilerId.cpp File Reference
+
+
+

Macro Definition Documentation

+ +

◆ __has_include

+ +
+
+ + + + + + + +
#define __has_include( x)   0
+
+ +
+
+ +

◆ ARCHITECTURE_ID

+ +
+
+ + + + +
#define ARCHITECTURE_ID
+
+ +
+
+ +

◆ COMPILER_ID

+ +
+
+ + + + +
#define COMPILER_ID   ""
+
+ +
+
+ +

◆ CXX_STD

+ +
+
+ + + + +
#define CXX_STD   __cplusplus
+
+ +
+
+ +

◆ DEC

+ +
+
+ + + + + + + +
#define DEC( n)
+
+Value:
('0' + (((n) / 10000000)%10)), \
+
('0' + (((n) / 1000000)%10)), \
+
('0' + (((n) / 100000)%10)), \
+
('0' + (((n) / 10000)%10)), \
+
('0' + (((n) / 1000)%10)), \
+
('0' + (((n) / 100)%10)), \
+
('0' + (((n) / 10)%10)), \
+
('0' + ((n) % 10))
+
+
+
+ +

◆ HEX

+ +
+
+ + + + + + + +
#define HEX( n)
+
+Value:
('0' + ((n)>>28 & 0xF)), \
+
('0' + ((n)>>24 & 0xF)), \
+
('0' + ((n)>>20 & 0xF)), \
+
('0' + ((n)>>16 & 0xF)), \
+
('0' + ((n)>>12 & 0xF)), \
+
('0' + ((n)>>8 & 0xF)), \
+
('0' + ((n)>>4 & 0xF)), \
+
('0' + ((n) & 0xF))
+
+
+
+ +

◆ PLATFORM_ID

+ +
+
+ + + + +
#define PLATFORM_ID
+
+ +
+
+ +

◆ STRINGIFY

+ +
+
+ + + + + + + +
#define STRINGIFY( X)   STRINGIFY_HELPER(X)
+
+ +
+
+ +

◆ STRINGIFY_HELPER

+ +
+
+ + + + + + + +
#define STRINGIFY_HELPER( X)   #X
+
+ +
+
+

Function Documentation

+ +

◆ main()

+ +
+
+ + + + + + + + + + + +
int main (int argc,
char * argv[] )
+
+ +
+
+

Variable Documentation

+ +

◆ info_arch

+ +
+
+ + + + +
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]"
+
+ +
+
+ +

◆ info_compiler

+ +
+
+ + + + +
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]"
+
+ +
+
+ +

◆ info_language_extensions_default

+ +
+
+ + + + +
const char* info_language_extensions_default
+
+Initial value:
= "INFO" ":" "extensions_default["
+
+
+
+
+
+
"OFF"
+
+
"]"
+
+
+
+ +

◆ info_language_standard_default

+ +
+
+ + + + +
const char* info_language_standard_default
+
+Initial value:
= "INFO" ":" "standard_default["
+
+
+
+
+
+
+
+
+
+
+
+
"98"
+
+
"]"
+
+
+
+ +

◆ info_platform

+ +
+
+ + + + +
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]"
+
+ +
+
+
+ + + + diff --git a/_images/ExampleDocsOutput.png b/_images/ExampleDocsOutput.png new file mode 100644 index 000000000..162c5610d Binary files /dev/null and b/_images/ExampleDocsOutput.png differ diff --git a/_sources/cpp-api/memory_utils.rst.txt b/_sources/cpp-api/memory_utils.rst.txt index e1792d0cc..81c854f94 100644 --- a/_sources/cpp-api/memory_utils.rst.txt +++ b/_sources/cpp-api/memory_utils.rst.txt @@ -1,5 +1,5 @@ CUDA Memory Operators ===================== -.. doxygengroup:: memory-utils +.. doxygengroup:: cumem-utils :content-only: diff --git a/_sources/cpp-api/quantize_ops.rst.txt b/_sources/cpp-api/quantize_ops.rst.txt index c55bf817b..70ed43d02 100644 --- a/_sources/cpp-api/quantize_ops.rst.txt +++ b/_sources/cpp-api/quantize_ops.rst.txt @@ -2,7 +2,8 @@ Quantization Operators =========================== Quantization is a model optimization technique to reduce the size of a large -model in order to achieve better storage performance with a small loss in accuracy. +model in order to achieve better storage performance with a small loss in +accuracy. CUDA Operators -------------- diff --git a/_sources/cpp-api/sparse_ops.rst.txt b/_sources/cpp-api/sparse_ops.rst.txt index 16a7d05a6..13752778d 100644 --- a/_sources/cpp-api/sparse_ops.rst.txt +++ b/_sources/cpp-api/sparse_ops.rst.txt @@ -1,13 +1,13 @@ Sparse Data Operators ===================== -Sparse Data CUDA Operators +CUDA Operators -------------------------- .. doxygengroup:: sparse-data-cuda :content-only: -Sparse Data CPU Operators +CPU Operators -------------------------- .. 
doxygengroup:: sparse-data-cpu diff --git a/_sources/general/BuildInstructions.rst.txt b/_sources/general/BuildInstructions.rst.txt index e51bf36f9..4a1734d47 100644 --- a/_sources/general/BuildInstructions.rst.txt +++ b/_sources/general/BuildInstructions.rst.txt @@ -1,8 +1,8 @@ Build Instructions ================== -**Note:** The most up-to-date instructions are embedded in a set of scripts -bundled in the FBGEMM_GPU repo under +**Note:** The most up-to-date build instructions are embedded in a set of +scripts bundled in the FBGEMM_GPU repo under `setup_env.bash `_. The general steps for building FBGEMM_GPU are as follows: @@ -255,6 +255,7 @@ Install the other necessary build tools such as ``ninja``, ``cmake``, etc: cmake \ hypothesis \ jinja2 \ + make \ ninja \ numpy \ scikit-build \ @@ -379,6 +380,8 @@ build cache: python setup.py clean +.. _fbgemm-gpu.docs.build.process.cuda: + CUDA Build ~~~~~~~~~~ @@ -436,6 +439,8 @@ CUDA device, however, is not required for building the package. --nvml_lib_path=${NVML_LIB_PATH} \ -DTORCH_CUDA_ARCH_LIST="${cuda_arch_list}" +.. _fbgemm-gpu.docs.build.process.rocm: + ROCm Build ~~~~~~~~~~ @@ -474,6 +479,8 @@ the package. -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \ -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA" +.. _fbgemm-gpu.docs.build.process.cpu: + CPU-Only Build ~~~~~~~~~~~~~~ diff --git a/_sources/general/DocsInstructions.rst.txt b/_sources/general/DocsInstructions.rst.txt new file mode 100644 index 000000000..37cbd2868 --- /dev/null +++ b/_sources/general/DocsInstructions.rst.txt @@ -0,0 +1,235 @@ +Contributing Documentation +========================== + +FBGEMM_GPU provides extensive comments in its source files, which provide the +most authoritative and up-to-date documentation available for the package. + + +Building the API Documentation +------------------------------ + +**Note:** The most up-to-date documentation build instructions are embedded in +a set of scripts bundled in the FBGEMM_GPU repo under +`setup_env.bash `_. 
+ +The general steps for building the FBGEMM_GPU documentation are as follows: + +#. Set up an isolated build environment. +#. Build FBGEMM_GPU (CPU variant). +#. Set up the documentation toolchain. +#. Run documentation build scripts. + +Set Up Build Environment +~~~~~~~~~~~~~~~~~~~~~~~~ + +Follow the instructions for setting up the Conda environment at +:ref:`fbgemm-gpu.docs.build.setup.env`. + +Build FBGEMM_GPU +~~~~~~~~~~~~~~~~ + +A build pass of FBGEMM_GPU is required for the documentation to be built +correctly. Follow the instructions in +:ref:`fbgemm-gpu.docs.build.setup.tools.install`, followed by +:ref:`fbgemm-gpu.docs.build.process.cpu`, to build FBGEMM_GPU (CPU variant). + +Set Up Documentation Toolchain +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sh + + # !! Run inside the Conda environment !! + + # From the /fbgemm_gpu/ directory + cd docs + + # Install Sphinx and other docs tools + pip install -r requirements.txt + + # Install Doxygen and Make + conda install -c conda-forge -y doxygen make + +Build the Documentation +~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sh + + # Generate the C++ documentation + make doxygen + + # Generate the Python documentation and assemble together with the C++ documentation + make html + +After the build completes, view the generated documentation: + +.. code:: sh + + sphinx-serve -b build + +Deployment Preview +~~~~~~~~~~~~~~~~~~ + +As a PyTorch project, a preview of the FBGEMM_GPU documentation will be +automatically built and deployed by `Netlify `__ +when pull requests are made. When the build completes, the deployment preview +can be found at: + +.. code:: sh + + https://deploy-preview->--pytorch-fbgemm-docs.netlify.app/ + + +General Documentation Guidelines +-------------------------------- + +When new public API methods are added, they should be accompanied by sufficient +documentation. Here are some guidelines for documenting FBGEMM_GPU code: + +* Code by itself is not documentation! 
Put yourself in the shoes of new + developers who has to understand what your code does, and make their lives + easier. + +* Documentation should be added for any and all public API methods. + +* Don't leave docstring-writing as a separate task. + +* Write docstrings together with the code. + +* At a very minimum, add: + + * A description of the method. + * A description for each argument that can be passed into the method. + * A description of the method's return value. + +* Add usage examples, links to other methods, and method invocation limitations. + + +Adding Documentation to Python Code +----------------------------------- + +Documentation for Python is provided through docstrings and generated using +`Sphinx `__. Please reference the +`Google-style Python docstrings +`__ +guide for docstring formatting examples. + +Please add Python docstrings to the ``.py`` files under the name of the +method: + +.. code:: python + + def example_function(): + """ + This class is an example of how you can write docstrings. + You can add multiple lines of those descriptions. Make sure to include + useful information about your method. + + Args: + arg1 (int): This is the first arg that you can pass with this function. + + Returns: + This function returns X. + + Raises: + AttributeError: This function raises an error. + + Example: + This is how you can use this function + + >>> print("Code blocks are supported") + + Note: + You can find more information + """ + +Adding docstrings does not automatically publish them to the package +documentation. To publish new docstrings: + +#. Add the module method to its corresponding ``.rst`` file. + +#. To preview locally, run ``make html``. + +#. Verify the changes by building the docs locally or submitting a PR for a + Netlify preview. + + +Adding Documentation to C++ Code +-------------------------------- + +Documentation for C++ is provided through +`Javadoc-style comments `__ +and generated using Sphinx + `Doxygen `__ + +`Breathe `__. 
+ + +Documentation is kept in header files with the ``.h`` extension as well as in +``.cpp``, ``cu``, and ``cuh`` files. In these files, everything between +``#ifndef DOXYGEN_THIS_WILL_BE_SKIPPED`` and ``#endif`` will be hidden from the +HTML output. At the moment, undocumented functions are hidden in these tags. +When you add descriptionss to a function, make sure that the ``#ifndef`` and +``#endif`` are configured correctly. + +All functions are grouped by a specific group for better organization. +Make sure you add ``@defgroup`` to the code comments. + +Follow these instructions to document, generate, and publish a new C++ +description: + +#. Add a description to the source header file. At a very minimum, add a + description verbatim, parameters by using the ``@param`` tag, and + return value by using the @return tag. You can other tags as needed. + Here is an example of how it can look: + + .. code:: cpp + + /// @defgroup example-method-group Example Method Group + /// This is a description of the example method group. + + /// @ingroup example-method-group + /// Description of `example_method` + /// + /// **Example:** + /// ```python + /// # Here is a Python code block + /// def foo(lst: List[int]): + /// return [ x ** 2 for x in lst ] + /// ``` + /// + /// @param param1 Description of param #1 + /// @param param2 Description of param #2 + /// + /// @return Description of the method's return value. + /// @throw fbgemm_gpu::my_error if an error occurs + /// + /// @note This is an example note. + /// @warning This is an example warning. + /// @see For more info, see here. + int32_t example_method(bool foo, float bar); + +#. Add a ``doxygengroup`` directive to the corresponding ``.rst`` file. If + an ``.rst`` file for the corresponding header file does not exist, create a + new one by the same name as the header file. If an ``.rst`` file already + exists, make sure the ``doxygengroup`` is defined in that file. + Using the above example: + + .. 
code:: rst + + Example Methods Group + --------------------- + + .. doxygengroup:: example-method-group + :content-only: + + This example generates the following HTML output: + + .. image:: ExampleDocsOutput.png + +#. Make sure the ``.rst`` file is included in to the ``toctree`` in + ``index.rst`` (:ref:`fbgemm-gpu.docs.toc.cpp`). + +#. The C++ source header file needs to be in one of the directories listed in + the ``INPUT`` parameter in ``Doxygen.ini``. If it's in a directory not + listed, be sure to append the directory path to the parameter. + +#. Verify the changes by building the docs locally or submitting a PR for a + Netlify preview. diff --git a/_sources/general/InstallationInstructions.rst.txt b/_sources/general/InstallationInstructions.rst.txt index 33a1ed775..873fbef5e 100644 --- a/_sources/general/InstallationInstructions.rst.txt +++ b/_sources/general/InstallationInstructions.rst.txt @@ -1,8 +1,8 @@ Installation Instructions ========================= -**Note:** The most up-to-date instructions are embedded in a set of scripts -bundled in the FBGEMM_GPU repo under +**Note:** The most up-to-date installation instructions are embedded in a set +of scripts bundled in the FBGEMM_GPU repo under `setup_env.bash `_. The general steps for installing FBGEMM_GPU are as follows: diff --git a/_sources/index.rst.txt b/_sources/index.rst.txt index c81aaac64..2b1139d46 100644 --- a/_sources/index.rst.txt +++ b/_sources/index.rst.txt @@ -9,6 +9,8 @@ Welcome to FBGEMM's documentation! This documentation provides a comprehensive reference of the `fbgemm_gpu` library. +.. _fbgemm-gpu.docs.toc.general: + .. toctree:: :maxdepth: 2 :caption: FBGEMM_GPU General Info @@ -16,7 +18,9 @@ library. general/BuildInstructions.rst general/InstallationInstructions.rst general/TestInstructions.rst + general/DocsInstructions.rst +.. _fbgemm-gpu.docs.toc.python: .. toctree:: :maxdepth: 2 @@ -25,6 +29,7 @@ library. 
python-api/table_batched_embedding_ops.rst python-api/jagged_tensor_ops.rst +.. _fbgemm-gpu.docs.toc.cpp: .. toctree:: :maxdepth: 2 diff --git a/_sources/pytorch-sphinx-theme/CODE_OF_CONDUCT.md.txt b/_sources/pytorch-sphinx-theme/CODE_OF_CONDUCT.md.txt deleted file mode 100644 index 4bd525a54..000000000 --- a/_sources/pytorch-sphinx-theme/CODE_OF_CONDUCT.md.txt +++ /dev/null @@ -1,76 +0,0 @@ -# Code of Conduct - -## Our Pledge - -In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation. - -## Our Standards - -Examples of behavior that contributes to creating a positive environment -include: - -* Using welcoming and inclusive language -* Being respectful of differing viewpoints and experiences -* Gracefully accepting constructive criticism -* Focusing on what is best for the community -* Showing empathy towards other community members - -Examples of unacceptable behavior by participants include: - -* The use of sexualized language or imagery and unwelcome sexual attention or -advances -* Trolling, insulting/derogatory comments, and personal or political attacks -* Public or private harassment -* Publishing others' private information, such as a physical or electronic -address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a -professional setting - -## Our Responsibilities - -Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior. 
- -Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful. - -## Scope - -This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers. - -## Enforcement - -Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at . All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately. - -Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project's leadership. 
- -## Attribution - -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html - -[homepage]: https://www.contributor-covenant.org - -For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq diff --git a/_sources/pytorch-sphinx-theme/CONTRIBUTING.md.txt b/_sources/pytorch-sphinx-theme/CONTRIBUTING.md.txt deleted file mode 100644 index e1655f56d..000000000 --- a/_sources/pytorch-sphinx-theme/CONTRIBUTING.md.txt +++ /dev/null @@ -1,31 +0,0 @@ -# Contributing to pytorch_sphinx_theme -We want to make contributing to this project as easy and transparent as -possible. - -## Pull Requests -We actively welcome your pull requests. - -1. Fork the repo and create your branch from `master`. -2. If you've added code that should be tested, add tests. -3. If you've changed APIs, update the documentation. -4. Ensure the test suite passes. -5. Make sure your code lints. -6. If you haven't already, complete the Contributor License Agreement ("CLA"). - -## Contributor License Agreement ("CLA") -In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook's open source projects. - -Complete your CLA here: - -## Issues -We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue. - -Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue. - -## License -By contributing to pytorch_sphinx_theme, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree. 
\ No newline at end of file diff --git a/_sources/pytorch-sphinx-theme/README.md.txt b/_sources/pytorch-sphinx-theme/README.md.txt deleted file mode 100644 index 5d2aeeb83..000000000 --- a/_sources/pytorch-sphinx-theme/README.md.txt +++ /dev/null @@ -1,188 +0,0 @@ -# PyTorch Sphinx Theme - -Sphinx theme for [PyTorch Docs](https://pytorch.org/docs/master/torch.html) and [PyTorch Tutorials](https://pytorch.org/tutorials) based on the [Read the Docs Sphinx Theme](https://sphinx-rtd-theme.readthedocs.io/en/latest). - -## Local Development - -Run python setup: - -``` -git clone https://github.com/pytorch/pytorch_sphinx_theme -pip install -e pytorch_sphinx_theme -``` - -and install the dependencies using `pip install -r docs/requirements.txt` - -In the root directory install the `package.json`: - -``` -# node version 8.4.0 -yarn install -``` - -If you have `npm` installed then run: - -``` -npm install -``` - -- If you want to see generated documentation for `docs/demo` then create -`.env.json` file and make it empty json file. Means `.env.json file` will -contain - -``` -{} -``` - -Run grunt to build the html site and enable live reloading of the demo app at `localhost:1919`: - -``` -grunt -``` - -- If you want to specify the project folder (docs or tutorial for which -you want to see docs generated) then you need to specify it into `.env.json` -file: - -``` -{ - "DOCS_DIR": "docs/", - "TUTORIALS_DIR": "path/to/tutorial/directory" -} -``` - -Run grunt to build the html site for docs: - -``` -grunt --project=docs -``` - -and to build the html site for tutorial: - -``` -grunt --project=tutorials -``` - -The resulting site is a demo. - -## Testing your changes and submitting a PR - -When you are ready to submit a PR with your changes you can first test that your changes have been applied correctly against either the PyTorch Docs or Tutorials repo: - -1. Run the `grunt build` task on your branch and commit the build to Github. -2. 
In your local docs or tutorials repo, remove any existing `pytorch_sphinx_theme` packages in the `src` folder (there should be a `pip-delete-this-directory.txt` file there) -3. Clone the repo locally `git clone https://github.com/pytorch/pytorch_sphinx_theme` -4. Install `pytorch_sphinx_theme` by running `pip install -e pytorch_sphinx_theme` -5. Install the requirements `pip install -r requirements.txt` -6. Remove the current build. In the docs this is `make clean`, tutorials is `make clean-cache` -7. Build the static site. In the docs this is `make html`, tutorials is `make html-noplot` -8. Open the site and look around. In the docs open `docs/build/html/index.html`, in the tutorials open `_build/html.index.html` - -If your changes have been applied successfully, remove the build commit from your branch and submit your PR. - -## Publishing the theme - -Before the new changes are visible in the theme the maintainer will need to run the build process: - -``` -grunt build -``` - -Once that is successful commit the change to Github. - -### Developing locally against PyTorch Docs and Tutorials - -To be able to modify and preview the theme locally against the PyTorch Docs and/or the PyTorch Tutorials first clone the repositories: - -- [PyTorch (Docs)](https://github.com/pytorch/pytorch) -- [PyTorch Tutorials](https://github.com/pytorch/tutorials) - -Then follow the instructions in each repository to make the docs. - -Once the docs have been successfully generated you should be able to run the following to create an html build. - -#### Docs - -``` -# in ./docs -make html -``` - -#### Tutorials - -``` -# root directory -make html -``` - -Once these are successful, navigate to the `conf.py` file in each project. In the Docs these are at `./docs/source`. The Tutorials one can be found in the root directory. 
- -In `conf.py` change the html theme to `pytorch_sphinx_theme` and point the html theme path to this repo's local folder, which will end up looking something like: - -``` -html_theme = 'pytorch_sphinx_theme' -html_theme_path = ["../../../pytorch_sphinx_theme"] -``` - -Next create a file `.env.json` in the root of this repo with some keys/values referencing the local folders of the Docs and Tutorials repos: - -``` -{ - "TUTORIALS_DIR": "../tutorials", - "DOCS_DIR": "../pytorch/docs/source" -} - -``` - -You can then build the Docs or Tutorials by running - -``` -grunt --project=docs -``` -or - -``` -grunt --project=tutorials -``` - -These will generate a live-reloaded local build for the respective projects available at `localhost:1919`. - -Note that while live reloading works these two projects are hefty and will take a few seconds to build and reload, especially the Docs. - -### Built-in Stylesheets and Fonts - -There are a couple of stylesheets and fonts inside the Docs and Tutorials repos themselves meant to override the existing theme. To ensure the most accurate styles we should comment out those files until the maintainers of those repos remove them: - -#### Docs - -``` -# ./docs/source/conf.py - -html_context = { - # 'css_files': [ - # 'https://fonts.googleapis.com/css?family=Lato', - # '_static/css/pytorch_theme.css' - # ], -} -``` - -#### Tutorials - -``` -# ./conf.py - -# app.add_stylesheet('css/pytorch_theme.css') -# app.add_stylesheet('https://fonts.googleapis.com/css?family=Lato') -``` - -### Top/Mobile Navigation - -The top navigation and mobile menu expect an "active" state for one of the menu items. To ensure that either "Docs" or "Tutorials" is marked as active, set the following config value in the respective `conf.py`, where `{project}` is either `"docs"` or `"tutorials"`. - -``` -html_theme_options = { - ... - 'pytorch_project': {project} - ... 
-} -``` diff --git a/_static/basic.css b/_static/basic.css index 24a49f09b..7577acb1a 100644 --- a/_static/basic.css +++ b/_static/basic.css @@ -4,7 +4,7 @@ * * Sphinx stylesheet -- basic theme. * - * :copyright: Copyright 2007-2020 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. * */ @@ -130,7 +130,7 @@ ul.search li a { font-weight: bold; } -ul.search li div.context { +ul.search li p.context { color: #888; margin: 2px 0 0 30px; text-align: left; @@ -222,7 +222,7 @@ table.modindextable td { /* -- general body styles --------------------------------------------------- */ div.body { - min-width: 450px; + min-width: 360px; max-width: 800px; } @@ -237,16 +237,6 @@ a.headerlink { visibility: hidden; } -a.brackets:before, -span.brackets > a:before{ - content: "["; -} - -a.brackets:after, -span.brackets > a:after { - content: "]"; -} - h1:hover > a.headerlink, h2:hover > a.headerlink, h3:hover > a.headerlink, @@ -277,25 +267,25 @@ p.rubric { font-weight: bold; } -img.align-left, .figure.align-left, object.align-left { +img.align-left, figure.align-left, .figure.align-left, object.align-left { clear: left; float: left; margin-right: 1em; } -img.align-right, .figure.align-right, object.align-right { +img.align-right, figure.align-right, .figure.align-right, object.align-right { clear: right; float: right; margin-left: 1em; } -img.align-center, .figure.align-center, object.align-center { +img.align-center, figure.align-center, .figure.align-center, object.align-center { display: block; margin-left: auto; margin-right: auto; } -img.align-default, .figure.align-default { +img.align-default, figure.align-default, .figure.align-default { display: block; margin-left: auto; margin-right: auto; @@ -319,7 +309,8 @@ img.align-default, .figure.align-default { /* -- sidebars -------------------------------------------------------------- */ -div.sidebar { +div.sidebar, +aside.sidebar { 
margin: 0 0 0.5em 1em; border: 1px solid #ddb; padding: 7px; @@ -334,12 +325,16 @@ p.sidebar-title { font-weight: bold; } +nav.contents, +aside.topic, div.admonition, div.topic, blockquote { clear: left; } /* -- topics ---------------------------------------------------------------- */ +nav.contents, +aside.topic, div.topic { border: 1px solid #ccc; padding: 7px; @@ -377,12 +372,18 @@ div.body p.centered { /* -- content of sidebars/topics/admonitions -------------------------------- */ div.sidebar > :last-child, +aside.sidebar > :last-child, +nav.contents > :last-child, +aside.topic > :last-child, div.topic > :last-child, div.admonition > :last-child { margin-bottom: 0; } div.sidebar::after, +aside.sidebar::after, +nav.contents::after, +aside.topic::after, div.topic::after, div.admonition::after, blockquote::after { @@ -425,10 +426,6 @@ table.docutils td, table.docutils th { border-bottom: 1px solid #aaa; } -table.footnote td, table.footnote th { - border: 0 !important; -} - th { text-align: left; padding-right: 5px; @@ -455,20 +452,22 @@ td > :last-child { /* -- figures --------------------------------------------------------------- */ -div.figure { +div.figure, figure { margin: 0.5em; padding: 0.5em; } -div.figure p.caption { +div.figure p.caption, figcaption { padding: 0.3em; } -div.figure p.caption span.caption-number { +div.figure p.caption span.caption-number, +figcaption span.caption-number { font-style: italic; } -div.figure p.caption span.caption-text { +div.figure p.caption span.caption-text, +figcaption span.caption-text { } /* -- field list styles ----------------------------------------------------- */ @@ -503,6 +502,63 @@ table.hlist td { vertical-align: top; } +/* -- object description styles --------------------------------------------- */ + +.sig { + font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace; +} + +.sig-name, code.descname { + background-color: transparent; + font-weight: bold; +} + +.sig-name { + 
font-size: 1.1em; +} + +code.descname { + font-size: 1.2em; +} + +.sig-prename, code.descclassname { + background-color: transparent; +} + +.optional { + font-size: 1.3em; +} + +.sig-paren { + font-size: larger; +} + +.sig-param.n { + font-style: italic; +} + +/* C++ specific styling */ + +.sig-inline.c-texpr, +.sig-inline.cpp-texpr { + font-family: unset; +} + +.sig.c .k, .sig.c .kt, +.sig.cpp .k, .sig.cpp .kt { + color: #0033B3; +} + +.sig.c .m, +.sig.cpp .m { + color: #1750EB; +} + +.sig.c .s, .sig.c .sc, +.sig.cpp .s, .sig.cpp .sc { + color: #067D17; +} + /* -- other body styles ----------------------------------------------------- */ @@ -553,19 +609,26 @@ ul.simple p { margin-bottom: 0; } -dl.footnote > dt, -dl.citation > dt { +aside.footnote > span, +div.citation > span { float: left; - margin-right: 0.5em; } - -dl.footnote > dd, -dl.citation > dd { +aside.footnote > span:last-of-type, +div.citation > span:last-of-type { + padding-right: 0.5em; +} +aside.footnote > p { + margin-left: 2em; +} +div.citation > p { + margin-left: 4em; +} +aside.footnote > p:last-of-type, +div.citation > p:last-of-type { margin-bottom: 0em; } - -dl.footnote > dd:after, -dl.citation > dd:after { +aside.footnote > p:last-of-type:after, +div.citation > p:last-of-type:after { content: ""; clear: both; } @@ -582,10 +645,6 @@ dl.field-list > dt { padding-right: 5px; } -dl.field-list > dt:after { - content: ":"; -} - dl.field-list > dd { padding-left: 0.5em; margin-top: 0em; @@ -629,14 +688,6 @@ dl.glossary dt { font-size: 1.1em; } -.optional { - font-size: 1.3em; -} - -.sig-paren { - font-size: larger; -} - .versionmodified { font-style: italic; } @@ -677,8 +728,9 @@ dl.glossary dt { .classifier:before { font-style: normal; - margin: 0.5em; + margin: 0 0.5em; content: ":"; + display: inline-block; } abbr, acronym { @@ -702,6 +754,7 @@ span.pre { -ms-hyphens: none; -webkit-hyphens: none; hyphens: none; + white-space: nowrap; } div[class*="highlight-"] { @@ -765,8 +818,12 @@ 
div.code-block-caption code { table.highlighttable td.linenos, span.linenos, -div.doctest > div.highlight span.gp { /* gp: Generic.Prompt */ - user-select: none; +div.highlight span.gp { /* gp: Generic.Prompt */ + user-select: none; + -webkit-user-select: text; /* Safari fallback only */ + -webkit-user-select: none; /* Chrome/Safari */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* IE10+ */ } div.code-block-caption span.caption-number { @@ -781,16 +838,6 @@ div.literal-block-wrapper { margin: 1em 0; } -code.descname { - background-color: transparent; - font-weight: bold; - font-size: 1.2em; -} - -code.descclassname { - background-color: transparent; -} - code.xref, a code { background-color: transparent; font-weight: bold; diff --git a/_static/doctools.js b/_static/doctools.js index 7d88f807d..d06a71d75 100644 --- a/_static/doctools.js +++ b/_static/doctools.js @@ -2,315 +2,155 @@ * doctools.js * ~~~~~~~~~~~ * - * Sphinx JavaScript utilities for all documentation. + * Base JavaScript utilities for all Sphinx HTML documentation. * - * :copyright: Copyright 2007-2020 by the Sphinx team, see AUTHORS. + * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. * :license: BSD, see LICENSE for details. 
* */ - -/** - * select a different prefix for underscore - */ -$u = _.noConflict(); - -/** - * make the code below compatible with browsers without - * an installed firebug like debugger -if (!window.console || !console.firebug) { - var names = ["log", "debug", "info", "warn", "error", "assert", "dir", - "dirxml", "group", "groupEnd", "time", "timeEnd", "count", "trace", - "profile", "profileEnd"]; - window.console = {}; - for (var i = 0; i < names.length; ++i) - window.console[names[i]] = function() {}; -} - */ - -/** - * small helper function to urldecode strings - */ -jQuery.urldecode = function(x) { - return decodeURIComponent(x).replace(/\+/g, ' '); -}; - -/** - * small helper function to urlencode strings - */ -jQuery.urlencode = encodeURIComponent; - -/** - * This function returns the parsed url parameters of the - * current request. Multiple values per key are supported, - * it will always return arrays of strings for the value parts. - */ -jQuery.getQueryParameters = function(s) { - if (typeof s === 'undefined') - s = document.location.search; - var parts = s.substr(s.indexOf('?') + 1).split('&'); - var result = {}; - for (var i = 0; i < parts.length; i++) { - var tmp = parts[i].split('=', 2); - var key = jQuery.urldecode(tmp[0]); - var value = jQuery.urldecode(tmp[1]); - if (key in result) - result[key].push(value); - else - result[key] = [value]; +"use strict"; + +const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([ + "TEXTAREA", + "INPUT", + "SELECT", + "BUTTON", +]); + +const _ready = (callback) => { + if (document.readyState !== "loading") { + callback(); + } else { + document.addEventListener("DOMContentLoaded", callback); } - return result; }; -/** - * highlight a given string on a jquery object by wrapping it in - * span elements with the given class name. 
- */ -jQuery.fn.highlightText = function(text, className) { - function highlight(node, addItems) { - if (node.nodeType === 3) { - var val = node.nodeValue; - var pos = val.toLowerCase().indexOf(text); - if (pos >= 0 && - !jQuery(node.parentNode).hasClass(className) && - !jQuery(node.parentNode).hasClass("nohighlight")) { - var span; - var isInSVG = jQuery(node).closest("body, svg, foreignObject").is("svg"); - if (isInSVG) { - span = document.createElementNS("http://www.w3.org/2000/svg", "tspan"); - } else { - span = document.createElement("span"); - span.className = className; - } - span.appendChild(document.createTextNode(val.substr(pos, text.length))); - node.parentNode.insertBefore(span, node.parentNode.insertBefore( - document.createTextNode(val.substr(pos + text.length)), - node.nextSibling)); - node.nodeValue = val.substr(0, pos); - if (isInSVG) { - var rect = document.createElementNS("http://www.w3.org/2000/svg", "rect"); - var bbox = node.parentElement.getBBox(); - rect.x.baseVal.value = bbox.x; - rect.y.baseVal.value = bbox.y; - rect.width.baseVal.value = bbox.width; - rect.height.baseVal.value = bbox.height; - rect.setAttribute('class', className); - addItems.push({ - "parent": node.parentNode, - "target": rect}); - } - } - } - else if (!jQuery(node).is("button, select, textarea")) { - jQuery.each(node.childNodes, function() { - highlight(this, addItems); - }); - } - } - var addItems = []; - var result = this.each(function() { - highlight(this, addItems); - }); - for (var i = 0; i < addItems.length; ++i) { - jQuery(addItems[i].parent).before(addItems[i].target); - } - return result; -}; - -/* - * backward compatibility for jQuery.browser - * This will be supported until firefox bug is fixed. 
- */ -if (!jQuery.browser) { - jQuery.uaMatch = function(ua) { - ua = ua.toLowerCase(); - - var match = /(chrome)[ \/]([\w.]+)/.exec(ua) || - /(webkit)[ \/]([\w.]+)/.exec(ua) || - /(opera)(?:.*version|)[ \/]([\w.]+)/.exec(ua) || - /(msie) ([\w.]+)/.exec(ua) || - ua.indexOf("compatible") < 0 && /(mozilla)(?:.*? rv:([\w.]+)|)/.exec(ua) || - []; - - return { - browser: match[ 1 ] || "", - version: match[ 2 ] || "0" - }; - }; - jQuery.browser = {}; - jQuery.browser[jQuery.uaMatch(navigator.userAgent).browser] = true; -} - /** * Small JavaScript module for the documentation. */ -var Documentation = { - - init : function() { - this.fixFirefoxAnchorBug(); - this.highlightSearchWords(); - this.initIndexTable(); - if (DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) { - this.initOnKeyListeners(); - } +const Documentation = { + init: () => { + Documentation.initDomainIndexTable(); + Documentation.initOnKeyListeners(); }, /** * i18n support */ - TRANSLATIONS : {}, - PLURAL_EXPR : function(n) { return n === 1 ? 0 : 1; }, - LOCALE : 'unknown', + TRANSLATIONS: {}, + PLURAL_EXPR: (n) => (n === 1 ? 0 : 1), + LOCALE: "unknown", // gettext and ngettext don't access this so that the functions // can safely bound to a different name (_ = Documentation.gettext) - gettext : function(string) { - var translated = Documentation.TRANSLATIONS[string]; - if (typeof translated === 'undefined') - return string; - return (typeof translated === 'string') ? translated : translated[0]; - }, - - ngettext : function(singular, plural, n) { - var translated = Documentation.TRANSLATIONS[singular]; - if (typeof translated === 'undefined') - return (n == 1) ? 
singular : plural; - return translated[Documentation.PLURALEXPR(n)]; + gettext: (string) => { + const translated = Documentation.TRANSLATIONS[string]; + switch (typeof translated) { + case "undefined": + return string; // no translation + case "string": + return translated; // translation exists + default: + return translated[0]; // (singular, plural) translation tuple exists + } }, - addTranslations : function(catalog) { - for (var key in catalog.messages) - this.TRANSLATIONS[key] = catalog.messages[key]; - this.PLURAL_EXPR = new Function('n', 'return +(' + catalog.plural_expr + ')'); - this.LOCALE = catalog.locale; + ngettext: (singular, plural, n) => { + const translated = Documentation.TRANSLATIONS[singular]; + if (typeof translated !== "undefined") + return translated[Documentation.PLURAL_EXPR(n)]; + return n === 1 ? singular : plural; }, - /** - * add context elements like header anchor links - */ - addContextElements : function() { - $('div[id] > :header:first').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this headline')). - appendTo(this); - }); - $('dt[id]').each(function() { - $('\u00B6'). - attr('href', '#' + this.id). - attr('title', _('Permalink to this definition')). 
- appendTo(this); - }); + addTranslations: (catalog) => { + Object.assign(Documentation.TRANSLATIONS, catalog.messages); + Documentation.PLURAL_EXPR = new Function( + "n", + `return (${catalog.plural_expr})` + ); + Documentation.LOCALE = catalog.locale; }, /** - * workaround a firefox stupidity - * see: https://bugzilla.mozilla.org/show_bug.cgi?id=645075 + * helper function to focus on search bar */ - fixFirefoxAnchorBug : function() { - if (document.location.hash && $.browser.mozilla) - window.setTimeout(function() { - document.location.href += ''; - }, 10); + focusSearchBar: () => { + document.querySelectorAll("input[name=q]")[0]?.focus(); }, /** - * highlight the search words provided in the url in the text + * Initialise the domain index toggle buttons */ - highlightSearchWords : function() { - var params = $.getQueryParameters(); - var terms = (params.highlight) ? params.highlight[0].split(/\s+/) : []; - if (terms.length) { - var body = $('div.body'); - if (!body.length) { - body = $('body'); + initDomainIndexTable: () => { + const toggler = (el) => { + const idNumber = el.id.substr(7); + const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`); + if (el.src.substr(-9) === "minus.png") { + el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`; + toggledRows.forEach((el) => (el.style.display = "none")); + } else { + el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`; + toggledRows.forEach((el) => (el.style.display = "")); } - window.setTimeout(function() { - $.each(terms, function() { - body.highlightText(this.toLowerCase(), 'highlighted'); - }); - }, 10); - $('') - .appendTo($('#searchbox')); - } - }, - - /** - * init the domain index toggle buttons - */ - initIndexTable : function() { - var togglers = $('img.toggler').click(function() { - var src = $(this).attr('src'); - var idnum = $(this).attr('id').substr(7); - $('tr.cg-' + idnum).toggle(); - if (src.substr(-9) === 'minus.png') - $(this).attr('src', src.substr(0, src.length-9) + 
'plus.png'); - else - $(this).attr('src', src.substr(0, src.length-8) + 'minus.png'); - }).css('display', ''); - if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) { - togglers.click(); - } - }, - - /** - * helper function to hide the search marks again - */ - hideSearchWords : function() { - $('#searchbox .highlight-link').fadeOut(300); - $('span.highlighted').removeClass('highlighted'); - }, - - /** - * make the url absolute - */ - makeURL : function(relativeURL) { - return DOCUMENTATION_OPTIONS.URL_ROOT + '/' + relativeURL; - }, + }; - /** - * get the current relative url - */ - getCurrentURL : function() { - var path = document.location.pathname; - var parts = path.split(/\//); - $.each(DOCUMENTATION_OPTIONS.URL_ROOT.split(/\//), function() { - if (this === '..') - parts.pop(); - }); - var url = parts.join('/'); - return path.substring(url.lastIndexOf('/') + 1, path.length - 1); + const togglerElements = document.querySelectorAll("img.toggler"); + togglerElements.forEach((el) => + el.addEventListener("click", (event) => toggler(event.currentTarget)) + ); + togglerElements.forEach((el) => (el.style.display = "")); + if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler); }, - initOnKeyListeners: function() { - $(document).keydown(function(event) { - var activeElementType = document.activeElement.tagName; - // don't navigate when in search box, textarea, dropdown or button - if (activeElementType !== 'TEXTAREA' && activeElementType !== 'INPUT' && activeElementType !== 'SELECT' - && activeElementType !== 'BUTTON' && !event.altKey && !event.ctrlKey && !event.metaKey - && !event.shiftKey) { - switch (event.keyCode) { - case 37: // left - var prevHref = $('link[rel="prev"]').prop('href'); - if (prevHref) { - window.location.href = prevHref; - return false; + initOnKeyListeners: () => { + // only install a listener if it is really needed + if ( + !DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS && + !DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS + ) + return; + 
+ document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.altKey || event.ctrlKey || event.metaKey) return; + + if (!event.shiftKey) { + switch (event.key) { + case "ArrowLeft": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const prevLink = document.querySelector('link[rel="prev"]'); + if (prevLink && prevLink.href) { + window.location.href = prevLink.href; + event.preventDefault(); } - case 39: // right - var nextHref = $('link[rel="next"]').prop('href'); - if (nextHref) { - window.location.href = nextHref; - return false; + break; + case "ArrowRight": + if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break; + + const nextLink = document.querySelector('link[rel="next"]'); + if (nextLink && nextLink.href) { + window.location.href = nextLink.href; + event.preventDefault(); } + break; } } + + // some keyboard layouts may need Shift to get / + switch (event.key) { + case "/": + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break; + Documentation.focusSearchBar(); + event.preventDefault(); + } }); - } + }, }; // quick alias for translations -_ = Documentation.gettext; +const _ = Documentation.gettext; -$(document).ready(function() { - Documentation.init(); -}); +_ready(Documentation.init); diff --git a/_static/documentation_options.js b/_static/documentation_options.js index 0030cfd35..c99fee7dd 100644 --- a/_static/documentation_options.js +++ b/_static/documentation_options.js @@ -1,12 +1,14 @@ var DOCUMENTATION_OPTIONS = { URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), VERSION: '0.1.2', - LANGUAGE: 'None', + LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', FILE_SUFFIX: '.html', LINK_SUFFIX: '.html', HAS_SOURCE: true, SOURCELINK_SUFFIX: '.txt', - NAVIGATION_WITH_KEYS: false + NAVIGATION_WITH_KEYS: false, + SHOW_SEARCH_SUMMARY: true, + 
ENABLE_SEARCH_SHORTCUTS: true, }; \ No newline at end of file diff --git a/_static/jquery-3.5.1.js b/_static/jquery-3.5.1.js deleted file mode 100644 index 50937333b..000000000 --- a/_static/jquery-3.5.1.js +++ /dev/null @@ -1,10872 +0,0 @@ -/*! - * jQuery JavaScript Library v3.5.1 - * https://jquery.com/ - * - * Includes Sizzle.js - * https://sizzlejs.com/ - * - * Copyright JS Foundation and other contributors - * Released under the MIT license - * https://jquery.org/license - * - * Date: 2020-05-04T22:49Z - */ -( function( global, factory ) { - - "use strict"; - - if ( typeof module === "object" && typeof module.exports === "object" ) { - - // For CommonJS and CommonJS-like environments where a proper `window` - // is present, execute the factory and get jQuery. - // For environments that do not have a `window` with a `document` - // (such as Node.js), expose a factory as module.exports. - // This accentuates the need for the creation of a real `window`. - // e.g. var jQuery = require("jquery")(window); - // See ticket #14549 for more info. - module.exports = global.document ? - factory( global, true ) : - function( w ) { - if ( !w.document ) { - throw new Error( "jQuery requires a window with a document" ); - } - return factory( w ); - }; - } else { - factory( global ); - } - -// Pass this if window is not defined yet -} )( typeof window !== "undefined" ? window : this, function( window, noGlobal ) { - -// Edge <= 12 - 13+, Firefox <=18 - 45+, IE 10 - 11, Safari 5.1 - 9+, iOS 6 - 9.1 -// throw exceptions when non-strict code (e.g., ASP.NET 4.5) accesses strict mode -// arguments.callee.caller (trac-13335). But as of jQuery 3.0 (2016), strict mode should be common -// enough that all such attempts are guarded in a try block. -"use strict"; - -var arr = []; - -var getProto = Object.getPrototypeOf; - -var slice = arr.slice; - -var flat = arr.flat ? 
function( array ) { - return arr.flat.call( array ); -} : function( array ) { - return arr.concat.apply( [], array ); -}; - - -var push = arr.push; - -var indexOf = arr.indexOf; - -var class2type = {}; - -var toString = class2type.toString; - -var hasOwn = class2type.hasOwnProperty; - -var fnToString = hasOwn.toString; - -var ObjectFunctionString = fnToString.call( Object ); - -var support = {}; - -var isFunction = function isFunction( obj ) { - - // Support: Chrome <=57, Firefox <=52 - // In some browsers, typeof returns "function" for HTML elements - // (i.e., `typeof document.createElement( "object" ) === "function"`). - // We don't want to classify *any* DOM node as a function. - return typeof obj === "function" && typeof obj.nodeType !== "number"; - }; - - -var isWindow = function isWindow( obj ) { - return obj != null && obj === obj.window; - }; - - -var document = window.document; - - - - var preservedScriptAttributes = { - type: true, - src: true, - nonce: true, - noModule: true - }; - - function DOMEval( code, node, doc ) { - doc = doc || document; - - var i, val, - script = doc.createElement( "script" ); - - script.text = code; - if ( node ) { - for ( i in preservedScriptAttributes ) { - - // Support: Firefox 64+, Edge 18+ - // Some browsers don't support the "nonce" property on scripts. - // On the other hand, just using `getAttribute` is not enough as - // the `nonce` attribute is reset to an empty string whenever it - // becomes browsing-context connected. - // See https://github.com/whatwg/html/issues/2369 - // See https://html.spec.whatwg.org/#nonce-attributes - // The `node.getAttribute` check was added for the sake of - // `jQuery.globalEval` so that it can fake a nonce-containing node - // via an object. 
- val = node[ i ] || node.getAttribute && node.getAttribute( i ); - if ( val ) { - script.setAttribute( i, val ); - } - } - } - doc.head.appendChild( script ).parentNode.removeChild( script ); - } - - -function toType( obj ) { - if ( obj == null ) { - return obj + ""; - } - - // Support: Android <=2.3 only (functionish RegExp) - return typeof obj === "object" || typeof obj === "function" ? - class2type[ toString.call( obj ) ] || "object" : - typeof obj; -} -/* global Symbol */ -// Defining this global in .eslintrc.json would create a danger of using the global -// unguarded in another place, it seems safer to define global only for this module - - - -var - version = "3.5.1", - - // Define a local copy of jQuery - jQuery = function( selector, context ) { - - // The jQuery object is actually just the init constructor 'enhanced' - // Need init if jQuery is called (just allow error to be thrown if not included) - return new jQuery.fn.init( selector, context ); - }; - -jQuery.fn = jQuery.prototype = { - - // The current version of jQuery being used - jquery: version, - - constructor: jQuery, - - // The default length of a jQuery object is 0 - length: 0, - - toArray: function() { - return slice.call( this ); - }, - - // Get the Nth element in the matched element set OR - // Get the whole matched element set as a clean array - get: function( num ) { - - // Return all the elements in a clean array - if ( num == null ) { - return slice.call( this ); - } - - // Return just the one element from the set - return num < 0 ? 
this[ num + this.length ] : this[ num ]; - }, - - // Take an array of elements and push it onto the stack - // (returning the new matched element set) - pushStack: function( elems ) { - - // Build a new jQuery matched element set - var ret = jQuery.merge( this.constructor(), elems ); - - // Add the old object onto the stack (as a reference) - ret.prevObject = this; - - // Return the newly-formed element set - return ret; - }, - - // Execute a callback for every element in the matched set. - each: function( callback ) { - return jQuery.each( this, callback ); - }, - - map: function( callback ) { - return this.pushStack( jQuery.map( this, function( elem, i ) { - return callback.call( elem, i, elem ); - } ) ); - }, - - slice: function() { - return this.pushStack( slice.apply( this, arguments ) ); - }, - - first: function() { - return this.eq( 0 ); - }, - - last: function() { - return this.eq( -1 ); - }, - - even: function() { - return this.pushStack( jQuery.grep( this, function( _elem, i ) { - return ( i + 1 ) % 2; - } ) ); - }, - - odd: function() { - return this.pushStack( jQuery.grep( this, function( _elem, i ) { - return i % 2; - } ) ); - }, - - eq: function( i ) { - var len = this.length, - j = +i + ( i < 0 ? len : 0 ); - return this.pushStack( j >= 0 && j < len ? [ this[ j ] ] : [] ); - }, - - end: function() { - return this.prevObject || this.constructor(); - }, - - // For internal use only. - // Behaves like an Array's method, not like a jQuery method. 
- push: push, - sort: arr.sort, - splice: arr.splice -}; - -jQuery.extend = jQuery.fn.extend = function() { - var options, name, src, copy, copyIsArray, clone, - target = arguments[ 0 ] || {}, - i = 1, - length = arguments.length, - deep = false; - - // Handle a deep copy situation - if ( typeof target === "boolean" ) { - deep = target; - - // Skip the boolean and the target - target = arguments[ i ] || {}; - i++; - } - - // Handle case when target is a string or something (possible in deep copy) - if ( typeof target !== "object" && !isFunction( target ) ) { - target = {}; - } - - // Extend jQuery itself if only one argument is passed - if ( i === length ) { - target = this; - i--; - } - - for ( ; i < length; i++ ) { - - // Only deal with non-null/undefined values - if ( ( options = arguments[ i ] ) != null ) { - - // Extend the base object - for ( name in options ) { - copy = options[ name ]; - - // Prevent Object.prototype pollution - // Prevent never-ending loop - if ( name === "__proto__" || target === copy ) { - continue; - } - - // Recurse if we're merging plain objects or arrays - if ( deep && copy && ( jQuery.isPlainObject( copy ) || - ( copyIsArray = Array.isArray( copy ) ) ) ) { - src = target[ name ]; - - // Ensure proper type for the source value - if ( copyIsArray && !Array.isArray( src ) ) { - clone = []; - } else if ( !copyIsArray && !jQuery.isPlainObject( src ) ) { - clone = {}; - } else { - clone = src; - } - copyIsArray = false; - - // Never move original objects, clone them - target[ name ] = jQuery.extend( deep, clone, copy ); - - // Don't bring in undefined values - } else if ( copy !== undefined ) { - target[ name ] = copy; - } - } - } - } - - // Return the modified object - return target; -}; - -jQuery.extend( { - - // Unique for each copy of jQuery on the page - expando: "jQuery" + ( version + Math.random() ).replace( /\D/g, "" ), - - // Assume jQuery is ready without the ready module - isReady: true, - - error: function( msg ) { - throw new 
Error( msg ); - }, - - noop: function() {}, - - isPlainObject: function( obj ) { - var proto, Ctor; - - // Detect obvious negatives - // Use toString instead of jQuery.type to catch host objects - if ( !obj || toString.call( obj ) !== "[object Object]" ) { - return false; - } - - proto = getProto( obj ); - - // Objects with no prototype (e.g., `Object.create( null )`) are plain - if ( !proto ) { - return true; - } - - // Objects with prototype are plain iff they were constructed by a global Object function - Ctor = hasOwn.call( proto, "constructor" ) && proto.constructor; - return typeof Ctor === "function" && fnToString.call( Ctor ) === ObjectFunctionString; - }, - - isEmptyObject: function( obj ) { - var name; - - for ( name in obj ) { - return false; - } - return true; - }, - - // Evaluates a script in a provided context; falls back to the global one - // if not specified. - globalEval: function( code, options, doc ) { - DOMEval( code, { nonce: options && options.nonce }, doc ); - }, - - each: function( obj, callback ) { - var length, i = 0; - - if ( isArrayLike( obj ) ) { - length = obj.length; - for ( ; i < length; i++ ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } else { - for ( i in obj ) { - if ( callback.call( obj[ i ], i, obj[ i ] ) === false ) { - break; - } - } - } - - return obj; - }, - - // results is for internal usage only - makeArray: function( arr, results ) { - var ret = results || []; - - if ( arr != null ) { - if ( isArrayLike( Object( arr ) ) ) { - jQuery.merge( ret, - typeof arr === "string" ? - [ arr ] : arr - ); - } else { - push.call( ret, arr ); - } - } - - return ret; - }, - - inArray: function( elem, arr, i ) { - return arr == null ? 
-1 : indexOf.call( arr, elem, i ); - }, - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - merge: function( first, second ) { - var len = +second.length, - j = 0, - i = first.length; - - for ( ; j < len; j++ ) { - first[ i++ ] = second[ j ]; - } - - first.length = i; - - return first; - }, - - grep: function( elems, callback, invert ) { - var callbackInverse, - matches = [], - i = 0, - length = elems.length, - callbackExpect = !invert; - - // Go through the array, only saving the items - // that pass the validator function - for ( ; i < length; i++ ) { - callbackInverse = !callback( elems[ i ], i ); - if ( callbackInverse !== callbackExpect ) { - matches.push( elems[ i ] ); - } - } - - return matches; - }, - - // arg is for internal usage only - map: function( elems, callback, arg ) { - var length, value, - i = 0, - ret = []; - - // Go through the array, translating each of the items to their new values - if ( isArrayLike( elems ) ) { - length = elems.length; - for ( ; i < length; i++ ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - - // Go through every key on the object, - } else { - for ( i in elems ) { - value = callback( elems[ i ], i, arg ); - - if ( value != null ) { - ret.push( value ); - } - } - } - - // Flatten any nested arrays - return flat( ret ); - }, - - // A global GUID counter for objects - guid: 1, - - // jQuery.support is not used in Core but other projects attach their - // properties to it so it needs to exist. 
- support: support -} ); - -if ( typeof Symbol === "function" ) { - jQuery.fn[ Symbol.iterator ] = arr[ Symbol.iterator ]; -} - -// Populate the class2type map -jQuery.each( "Boolean Number String Function Array Date RegExp Object Error Symbol".split( " " ), -function( _i, name ) { - class2type[ "[object " + name + "]" ] = name.toLowerCase(); -} ); - -function isArrayLike( obj ) { - - // Support: real iOS 8.2 only (not reproducible in simulator) - // `in` check used to prevent JIT error (gh-2145) - // hasOwn isn't used here due to false negatives - // regarding Nodelist length in IE - var length = !!obj && "length" in obj && obj.length, - type = toType( obj ); - - if ( isFunction( obj ) || isWindow( obj ) ) { - return false; - } - - return type === "array" || length === 0 || - typeof length === "number" && length > 0 && ( length - 1 ) in obj; -} -var Sizzle = -/*! - * Sizzle CSS Selector Engine v2.3.5 - * https://sizzlejs.com/ - * - * Copyright JS Foundation and other contributors - * Released under the MIT license - * https://js.foundation/ - * - * Date: 2020-03-14 - */ -( function( window ) { -var i, - support, - Expr, - getText, - isXML, - tokenize, - compile, - select, - outermostContext, - sortInput, - hasDuplicate, - - // Local document vars - setDocument, - document, - docElem, - documentIsHTML, - rbuggyQSA, - rbuggyMatches, - matches, - contains, - - // Instance-specific data - expando = "sizzle" + 1 * new Date(), - preferredDoc = window.document, - dirruns = 0, - done = 0, - classCache = createCache(), - tokenCache = createCache(), - compilerCache = createCache(), - nonnativeSelectorCache = createCache(), - sortOrder = function( a, b ) { - if ( a === b ) { - hasDuplicate = true; - } - return 0; - }, - - // Instance methods - hasOwn = ( {} ).hasOwnProperty, - arr = [], - pop = arr.pop, - pushNative = arr.push, - push = arr.push, - slice = arr.slice, - - // Use a stripped-down indexOf as it's faster than native - // https://jsperf.com/thor-indexof-vs-for/5 - 
indexOf = function( list, elem ) { - var i = 0, - len = list.length; - for ( ; i < len; i++ ) { - if ( list[ i ] === elem ) { - return i; - } - } - return -1; - }, - - booleans = "checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|" + - "ismap|loop|multiple|open|readonly|required|scoped", - - // Regular expressions - - // http://www.w3.org/TR/css3-selectors/#whitespace - whitespace = "[\\x20\\t\\r\\n\\f]", - - // https://www.w3.org/TR/css-syntax-3/#ident-token-diagram - identifier = "(?:\\\\[\\da-fA-F]{1,6}" + whitespace + - "?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+", - - // Attribute selectors: http://www.w3.org/TR/selectors/#attribute-selectors - attributes = "\\[" + whitespace + "*(" + identifier + ")(?:" + whitespace + - - // Operator (capture 2) - "*([*^$|!~]?=)" + whitespace + - - // "Attribute values must be CSS identifiers [capture 5] - // or strings [capture 3 or capture 4]" - "*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|(" + identifier + "))|)" + - whitespace + "*\\]", - - pseudos = ":(" + identifier + ")(?:\\((" + - - // To reduce the number of selectors needing tokenize in the preFilter, prefer arguments: - // 1. quoted (capture 3; capture 4 or capture 5) - "('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|" + - - // 2. simple (capture 6) - "((?:\\\\.|[^\\\\()[\\]]|" + attributes + ")*)|" + - - // 3. 
anything else (capture 2) - ".*" + - ")\\)|)", - - // Leading and non-escaped trailing whitespace, capturing some non-whitespace characters preceding the latter - rwhitespace = new RegExp( whitespace + "+", "g" ), - rtrim = new RegExp( "^" + whitespace + "+|((?:^|[^\\\\])(?:\\\\.)*)" + - whitespace + "+$", "g" ), - - rcomma = new RegExp( "^" + whitespace + "*," + whitespace + "*" ), - rcombinators = new RegExp( "^" + whitespace + "*([>+~]|" + whitespace + ")" + whitespace + - "*" ), - rdescend = new RegExp( whitespace + "|>" ), - - rpseudo = new RegExp( pseudos ), - ridentifier = new RegExp( "^" + identifier + "$" ), - - matchExpr = { - "ID": new RegExp( "^#(" + identifier + ")" ), - "CLASS": new RegExp( "^\\.(" + identifier + ")" ), - "TAG": new RegExp( "^(" + identifier + "|[*])" ), - "ATTR": new RegExp( "^" + attributes ), - "PSEUDO": new RegExp( "^" + pseudos ), - "CHILD": new RegExp( "^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\(" + - whitespace + "*(even|odd|(([+-]|)(\\d*)n|)" + whitespace + "*(?:([+-]|)" + - whitespace + "*(\\d+)|))" + whitespace + "*\\)|)", "i" ), - "bool": new RegExp( "^(?:" + booleans + ")$", "i" ), - - // For use in libraries implementing .is() - // We use this for POS matching in `select` - "needsContext": new RegExp( "^" + whitespace + - "*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\(" + whitespace + - "*((?:-\\d)?\\d*)" + whitespace + "*\\)|)(?=[^-]|$)", "i" ) - }, - - rhtml = /HTML$/i, - rinputs = /^(?:input|select|textarea|button)$/i, - rheader = /^h\d$/i, - - rnative = /^[^{]+\{\s*\[native \w/, - - // Easily-parseable/retrievable ID or TAG or CLASS selectors - rquickExpr = /^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/, - - rsibling = /[+~]/, - - // CSS escapes - // http://www.w3.org/TR/CSS21/syndata.html#escaped-characters - runescape = new RegExp( "\\\\[\\da-fA-F]{1,6}" + whitespace + "?|\\\\([^\\r\\n\\f])", "g" ), - funescape = function( escape, nonHex ) { - var high = "0x" + escape.slice( 1 ) - 0x10000; - - return nonHex ? 
- - // Strip the backslash prefix from a non-hex escape sequence - nonHex : - - // Replace a hexadecimal escape sequence with the encoded Unicode code point - // Support: IE <=11+ - // For values outside the Basic Multilingual Plane (BMP), manually construct a - // surrogate pair - high < 0 ? - String.fromCharCode( high + 0x10000 ) : - String.fromCharCode( high >> 10 | 0xD800, high & 0x3FF | 0xDC00 ); - }, - - // CSS string/identifier serialization - // https://drafts.csswg.org/cssom/#common-serializing-idioms - rcssescape = /([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g, - fcssescape = function( ch, asCodePoint ) { - if ( asCodePoint ) { - - // U+0000 NULL becomes U+FFFD REPLACEMENT CHARACTER - if ( ch === "\0" ) { - return "\uFFFD"; - } - - // Control characters and (dependent upon position) numbers get escaped as code points - return ch.slice( 0, -1 ) + "\\" + - ch.charCodeAt( ch.length - 1 ).toString( 16 ) + " "; - } - - // Other potentially-special ASCII characters get backslash-escaped - return "\\" + ch; - }, - - // Used for iframes - // See setDocument() - // Removing the function wrapper causes a "Permission Denied" - // error in IE - unloadHandler = function() { - setDocument(); - }, - - inDisabledFieldset = addCombinator( - function( elem ) { - return elem.disabled === true && elem.nodeName.toLowerCase() === "fieldset"; - }, - { dir: "parentNode", next: "legend" } - ); - -// Optimize for push.apply( _, NodeList ) -try { - push.apply( - ( arr = slice.call( preferredDoc.childNodes ) ), - preferredDoc.childNodes - ); - - // Support: Android<4.0 - // Detect silently failing push.apply - // eslint-disable-next-line no-unused-expressions - arr[ preferredDoc.childNodes.length ].nodeType; -} catch ( e ) { - push = { apply: arr.length ? 
- - // Leverage slice if possible - function( target, els ) { - pushNative.apply( target, slice.call( els ) ); - } : - - // Support: IE<9 - // Otherwise append directly - function( target, els ) { - var j = target.length, - i = 0; - - // Can't trust NodeList.length - while ( ( target[ j++ ] = els[ i++ ] ) ) {} - target.length = j - 1; - } - }; -} - -function Sizzle( selector, context, results, seed ) { - var m, i, elem, nid, match, groups, newSelector, - newContext = context && context.ownerDocument, - - // nodeType defaults to 9, since context defaults to document - nodeType = context ? context.nodeType : 9; - - results = results || []; - - // Return early from calls with invalid selector or context - if ( typeof selector !== "string" || !selector || - nodeType !== 1 && nodeType !== 9 && nodeType !== 11 ) { - - return results; - } - - // Try to shortcut find operations (as opposed to filters) in HTML documents - if ( !seed ) { - setDocument( context ); - context = context || document; - - if ( documentIsHTML ) { - - // If the selector is sufficiently simple, try using a "get*By*" DOM method - // (excepting DocumentFragment context, where the methods don't exist) - if ( nodeType !== 11 && ( match = rquickExpr.exec( selector ) ) ) { - - // ID selector - if ( ( m = match[ 1 ] ) ) { - - // Document context - if ( nodeType === 9 ) { - if ( ( elem = context.getElementById( m ) ) ) { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( elem.id === m ) { - results.push( elem ); - return results; - } - } else { - return results; - } - - // Element context - } else { - - // Support: IE, Opera, Webkit - // TODO: identify versions - // getElementById can match elements by name instead of ID - if ( newContext && ( elem = newContext.getElementById( m ) ) && - contains( context, elem ) && - elem.id === m ) { - - results.push( elem ); - return results; - } - } - - // Type selector - } else if ( match[ 2 
] ) { - push.apply( results, context.getElementsByTagName( selector ) ); - return results; - - // Class selector - } else if ( ( m = match[ 3 ] ) && support.getElementsByClassName && - context.getElementsByClassName ) { - - push.apply( results, context.getElementsByClassName( m ) ); - return results; - } - } - - // Take advantage of querySelectorAll - if ( support.qsa && - !nonnativeSelectorCache[ selector + " " ] && - ( !rbuggyQSA || !rbuggyQSA.test( selector ) ) && - - // Support: IE 8 only - // Exclude object elements - ( nodeType !== 1 || context.nodeName.toLowerCase() !== "object" ) ) { - - newSelector = selector; - newContext = context; - - // qSA considers elements outside a scoping root when evaluating child or - // descendant combinators, which is not what we want. - // In such cases, we work around the behavior by prefixing every selector in the - // list with an ID selector referencing the scope context. - // The technique has to be used as well when a leading combinator is used - // as such selectors are not recognized by querySelectorAll. - // Thanks to Andrew Dupont for this technique. - if ( nodeType === 1 && - ( rdescend.test( selector ) || rcombinators.test( selector ) ) ) { - - // Expand context for sibling selectors - newContext = rsibling.test( selector ) && testContext( context.parentNode ) || - context; - - // We can use :scope instead of the ID hack if the browser - // supports it & if we're not changing the context. - if ( newContext !== context || !support.scope ) { - - // Capture the context ID, setting it first if necessary - if ( ( nid = context.getAttribute( "id" ) ) ) { - nid = nid.replace( rcssescape, fcssescape ); - } else { - context.setAttribute( "id", ( nid = expando ) ); - } - } - - // Prefix every selector in the list - groups = tokenize( selector ); - i = groups.length; - while ( i-- ) { - groups[ i ] = ( nid ? 
"#" + nid : ":scope" ) + " " + - toSelector( groups[ i ] ); - } - newSelector = groups.join( "," ); - } - - try { - push.apply( results, - newContext.querySelectorAll( newSelector ) - ); - return results; - } catch ( qsaError ) { - nonnativeSelectorCache( selector, true ); - } finally { - if ( nid === expando ) { - context.removeAttribute( "id" ); - } - } - } - } - } - - // All others - return select( selector.replace( rtrim, "$1" ), context, results, seed ); -} - -/** - * Create key-value caches of limited size - * @returns {function(string, object)} Returns the Object data after storing it on itself with - * property name the (space-suffixed) string and (if the cache is larger than Expr.cacheLength) - * deleting the oldest entry - */ -function createCache() { - var keys = []; - - function cache( key, value ) { - - // Use (key + " ") to avoid collision with native prototype properties (see Issue #157) - if ( keys.push( key + " " ) > Expr.cacheLength ) { - - // Only keep the most recent entries - delete cache[ keys.shift() ]; - } - return ( cache[ key + " " ] = value ); - } - return cache; -} - -/** - * Mark a function for special use by Sizzle - * @param {Function} fn The function to mark - */ -function markFunction( fn ) { - fn[ expando ] = true; - return fn; -} - -/** - * Support testing using an element - * @param {Function} fn Passed the created element and returns a boolean result - */ -function assert( fn ) { - var el = document.createElement( "fieldset" ); - - try { - return !!fn( el ); - } catch ( e ) { - return false; - } finally { - - // Remove from its parent by default - if ( el.parentNode ) { - el.parentNode.removeChild( el ); - } - - // release memory in IE - el = null; - } -} - -/** - * Adds the same handler for all of the specified attrs - * @param {String} attrs Pipe-separated list of attributes - * @param {Function} handler The method that will be applied - */ -function addHandle( attrs, handler ) { - var arr = attrs.split( "|" ), - i = 
arr.length; - - while ( i-- ) { - Expr.attrHandle[ arr[ i ] ] = handler; - } -} - -/** - * Checks document order of two siblings - * @param {Element} a - * @param {Element} b - * @returns {Number} Returns less than 0 if a precedes b, greater than 0 if a follows b - */ -function siblingCheck( a, b ) { - var cur = b && a, - diff = cur && a.nodeType === 1 && b.nodeType === 1 && - a.sourceIndex - b.sourceIndex; - - // Use IE sourceIndex if available on both nodes - if ( diff ) { - return diff; - } - - // Check if b follows a - if ( cur ) { - while ( ( cur = cur.nextSibling ) ) { - if ( cur === b ) { - return -1; - } - } - } - - return a ? 1 : -1; -} - -/** - * Returns a function to use in pseudos for input types - * @param {String} type - */ -function createInputPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for buttons - * @param {String} type - */ -function createButtonPseudo( type ) { - return function( elem ) { - var name = elem.nodeName.toLowerCase(); - return ( name === "input" || name === "button" ) && elem.type === type; - }; -} - -/** - * Returns a function to use in pseudos for :enabled/:disabled - * @param {Boolean} disabled true for :disabled; false for :enabled - */ -function createDisabledPseudo( disabled ) { - - // Known :disabled false positives: fieldset[disabled] > legend:nth-of-type(n+2) :can-disable - return function( elem ) { - - // Only certain elements can match :enabled or :disabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-enabled - // https://html.spec.whatwg.org/multipage/scripting.html#selector-disabled - if ( "form" in elem ) { - - // Check for inherited disabledness on relevant non-disabled elements: - // * listed form-associated elements in a disabled fieldset - // https://html.spec.whatwg.org/multipage/forms.html#category-listed - // 
https://html.spec.whatwg.org/multipage/forms.html#concept-fe-disabled - // * option elements in a disabled optgroup - // https://html.spec.whatwg.org/multipage/forms.html#concept-option-disabled - // All such elements have a "form" property. - if ( elem.parentNode && elem.disabled === false ) { - - // Option elements defer to a parent optgroup if present - if ( "label" in elem ) { - if ( "label" in elem.parentNode ) { - return elem.parentNode.disabled === disabled; - } else { - return elem.disabled === disabled; - } - } - - // Support: IE 6 - 11 - // Use the isDisabled shortcut property to check for disabled fieldset ancestors - return elem.isDisabled === disabled || - - // Where there is no isDisabled, check manually - /* jshint -W018 */ - elem.isDisabled !== !disabled && - inDisabledFieldset( elem ) === disabled; - } - - return elem.disabled === disabled; - - // Try to winnow out elements that can't be disabled before trusting the disabled property. - // Some victims get caught in our net (label, legend, menu, track), but it shouldn't - // even exist on them, let alone have a boolean value. 
- } else if ( "label" in elem ) { - return elem.disabled === disabled; - } - - // Remaining elements are neither :enabled nor :disabled - return false; - }; -} - -/** - * Returns a function to use in pseudos for positionals - * @param {Function} fn - */ -function createPositionalPseudo( fn ) { - return markFunction( function( argument ) { - argument = +argument; - return markFunction( function( seed, matches ) { - var j, - matchIndexes = fn( [], seed.length, argument ), - i = matchIndexes.length; - - // Match elements found at the specified indexes - while ( i-- ) { - if ( seed[ ( j = matchIndexes[ i ] ) ] ) { - seed[ j ] = !( matches[ j ] = seed[ j ] ); - } - } - } ); - } ); -} - -/** - * Checks a node for validity as a Sizzle context - * @param {Element|Object=} context - * @returns {Element|Object|Boolean} The input node if acceptable, otherwise a falsy value - */ -function testContext( context ) { - return context && typeof context.getElementsByTagName !== "undefined" && context; -} - -// Expose support vars for convenience -support = Sizzle.support = {}; - -/** - * Detects XML nodes - * @param {Element|Object} elem An element or a document - * @returns {Boolean} True iff elem is a non-HTML XML node - */ -isXML = Sizzle.isXML = function( elem ) { - var namespace = elem.namespaceURI, - docElem = ( elem.ownerDocument || elem ).documentElement; - - // Support: IE <=8 - // Assume HTML when documentElement doesn't yet exist, such as inside loading iframes - // https://bugs.jquery.com/ticket/4833 - return !rhtml.test( namespace || docElem && docElem.nodeName || "HTML" ); -}; - -/** - * Sets document-related variables once based on the current document - * @param {Element|Object} [doc] An element or document object to use to set the document - * @returns {Object} Returns the current document - */ -setDocument = Sizzle.setDocument = function( node ) { - var hasCompare, subWindow, - doc = node ? 
node.ownerDocument || node : preferredDoc; - - // Return early if doc is invalid or already selected - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( doc == document || doc.nodeType !== 9 || !doc.documentElement ) { - return document; - } - - // Update global variables - document = doc; - docElem = document.documentElement; - documentIsHTML = !isXML( document ); - - // Support: IE 9 - 11+, Edge 12 - 18+ - // Accessing iframe documents after unload throws "permission denied" errors (jQuery #13936) - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( preferredDoc != document && - ( subWindow = document.defaultView ) && subWindow.top !== subWindow ) { - - // Support: IE 11, Edge - if ( subWindow.addEventListener ) { - subWindow.addEventListener( "unload", unloadHandler, false ); - - // Support: IE 9 - 10 only - } else if ( subWindow.attachEvent ) { - subWindow.attachEvent( "onunload", unloadHandler ); - } - } - - // Support: IE 8 - 11+, Edge 12 - 18+, Chrome <=16 - 25 only, Firefox <=3.6 - 31 only, - // Safari 4 - 5 only, Opera <=11.6 - 12.x only - // IE/Edge & older browsers don't support the :scope pseudo-class. - // Support: Safari 6.0 only - // Safari 6.0 supports :scope but it's an alias of :root there. 
- support.scope = assert( function( el ) { - docElem.appendChild( el ).appendChild( document.createElement( "div" ) ); - return typeof el.querySelectorAll !== "undefined" && - !el.querySelectorAll( ":scope fieldset div" ).length; - } ); - - /* Attributes - ---------------------------------------------------------------------- */ - - // Support: IE<8 - // Verify that getAttribute really returns attributes and not properties - // (excepting IE8 booleans) - support.attributes = assert( function( el ) { - el.className = "i"; - return !el.getAttribute( "className" ); - } ); - - /* getElement(s)By* - ---------------------------------------------------------------------- */ - - // Check if getElementsByTagName("*") returns only elements - support.getElementsByTagName = assert( function( el ) { - el.appendChild( document.createComment( "" ) ); - return !el.getElementsByTagName( "*" ).length; - } ); - - // Support: IE<9 - support.getElementsByClassName = rnative.test( document.getElementsByClassName ); - - // Support: IE<10 - // Check if getElementById returns elements by name - // The broken getElementById methods don't pick up programmatically-set names, - // so use a roundabout getElementsByName test - support.getById = assert( function( el ) { - docElem.appendChild( el ).id = expando; - return !document.getElementsByName || !document.getElementsByName( expando ).length; - } ); - - // ID filter and find - if ( support.getById ) { - Expr.filter[ "ID" ] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - return elem.getAttribute( "id" ) === attrId; - }; - }; - Expr.find[ "ID" ] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var elem = context.getElementById( id ); - return elem ? 
[ elem ] : []; - } - }; - } else { - Expr.filter[ "ID" ] = function( id ) { - var attrId = id.replace( runescape, funescape ); - return function( elem ) { - var node = typeof elem.getAttributeNode !== "undefined" && - elem.getAttributeNode( "id" ); - return node && node.value === attrId; - }; - }; - - // Support: IE 6 - 7 only - // getElementById is not reliable as a find shortcut - Expr.find[ "ID" ] = function( id, context ) { - if ( typeof context.getElementById !== "undefined" && documentIsHTML ) { - var node, i, elems, - elem = context.getElementById( id ); - - if ( elem ) { - - // Verify the id attribute - node = elem.getAttributeNode( "id" ); - if ( node && node.value === id ) { - return [ elem ]; - } - - // Fall back on getElementsByName - elems = context.getElementsByName( id ); - i = 0; - while ( ( elem = elems[ i++ ] ) ) { - node = elem.getAttributeNode( "id" ); - if ( node && node.value === id ) { - return [ elem ]; - } - } - } - - return []; - } - }; - } - - // Tag - Expr.find[ "TAG" ] = support.getElementsByTagName ? 
- function( tag, context ) { - if ( typeof context.getElementsByTagName !== "undefined" ) { - return context.getElementsByTagName( tag ); - - // DocumentFragment nodes don't have gEBTN - } else if ( support.qsa ) { - return context.querySelectorAll( tag ); - } - } : - - function( tag, context ) { - var elem, - tmp = [], - i = 0, - - // By happy coincidence, a (broken) gEBTN appears on DocumentFragment nodes too - results = context.getElementsByTagName( tag ); - - // Filter out possible comments - if ( tag === "*" ) { - while ( ( elem = results[ i++ ] ) ) { - if ( elem.nodeType === 1 ) { - tmp.push( elem ); - } - } - - return tmp; - } - return results; - }; - - // Class - Expr.find[ "CLASS" ] = support.getElementsByClassName && function( className, context ) { - if ( typeof context.getElementsByClassName !== "undefined" && documentIsHTML ) { - return context.getElementsByClassName( className ); - } - }; - - /* QSA/matchesSelector - ---------------------------------------------------------------------- */ - - // QSA and matchesSelector support - - // matchesSelector(:active) reports false when true (IE9/Opera 11.5) - rbuggyMatches = []; - - // qSa(:focus) reports false when true (Chrome 21) - // We allow this because of a bug in IE8/9 that throws an error - // whenever `document.activeElement` is accessed on an iframe - // So, we allow :focus to pass through QSA all the time to avoid the IE error - // See https://bugs.jquery.com/ticket/13378 - rbuggyQSA = []; - - if ( ( support.qsa = rnative.test( document.querySelectorAll ) ) ) { - - // Build QSA regex - // Regex strategy adopted from Diego Perini - assert( function( el ) { - - var input; - - // Select is set to empty string on purpose - // This is to test IE's treatment of not explicitly - // setting a boolean content attribute, - // since its presence should be enough - // https://bugs.jquery.com/ticket/12359 - docElem.appendChild( el ).innerHTML = "" + - ""; - - // Support: IE8, Opera 11-12.16 - // Nothing should 
be selected when empty strings follow ^= or $= or *= - // The test attribute must be unknown in Opera but "safe" for WinRT - // https://msdn.microsoft.com/en-us/library/ie/hh465388.aspx#attribute_section - if ( el.querySelectorAll( "[msallowcapture^='']" ).length ) { - rbuggyQSA.push( "[*^$]=" + whitespace + "*(?:''|\"\")" ); - } - - // Support: IE8 - // Boolean attributes and "value" are not treated correctly - if ( !el.querySelectorAll( "[selected]" ).length ) { - rbuggyQSA.push( "\\[" + whitespace + "*(?:value|" + booleans + ")" ); - } - - // Support: Chrome<29, Android<4.4, Safari<7.0+, iOS<7.0+, PhantomJS<1.9.8+ - if ( !el.querySelectorAll( "[id~=" + expando + "-]" ).length ) { - rbuggyQSA.push( "~=" ); - } - - // Support: IE 11+, Edge 15 - 18+ - // IE 11/Edge don't find elements on a `[name='']` query in some cases. - // Adding a temporary attribute to the document before the selection works - // around the issue. - // Interestingly, IE 10 & older don't seem to have the issue. - input = document.createElement( "input" ); - input.setAttribute( "name", "" ); - el.appendChild( input ); - if ( !el.querySelectorAll( "[name='']" ).length ) { - rbuggyQSA.push( "\\[" + whitespace + "*name" + whitespace + "*=" + - whitespace + "*(?:''|\"\")" ); - } - - // Webkit/Opera - :checked should return selected option elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - // IE8 throws error here and will not see later tests - if ( !el.querySelectorAll( ":checked" ).length ) { - rbuggyQSA.push( ":checked" ); - } - - // Support: Safari 8+, iOS 8+ - // https://bugs.webkit.org/show_bug.cgi?id=136851 - // In-page `selector#id sibling-combinator selector` fails - if ( !el.querySelectorAll( "a#" + expando + "+*" ).length ) { - rbuggyQSA.push( ".#.+[+~]" ); - } - - // Support: Firefox <=3.6 - 5 only - // Old Firefox doesn't throw on a badly-escaped identifier. 
- el.querySelectorAll( "\\\f" ); - rbuggyQSA.push( "[\\r\\n\\f]" ); - } ); - - assert( function( el ) { - el.innerHTML = "" + - ""; - - // Support: Windows 8 Native Apps - // The type and name attributes are restricted during .innerHTML assignment - var input = document.createElement( "input" ); - input.setAttribute( "type", "hidden" ); - el.appendChild( input ).setAttribute( "name", "D" ); - - // Support: IE8 - // Enforce case-sensitivity of name attribute - if ( el.querySelectorAll( "[name=d]" ).length ) { - rbuggyQSA.push( "name" + whitespace + "*[*^$|!~]?=" ); - } - - // FF 3.5 - :enabled/:disabled and hidden elements (hidden elements are still enabled) - // IE8 throws error here and will not see later tests - if ( el.querySelectorAll( ":enabled" ).length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Support: IE9-11+ - // IE's :disabled selector does not pick up the children of disabled fieldsets - docElem.appendChild( el ).disabled = true; - if ( el.querySelectorAll( ":disabled" ).length !== 2 ) { - rbuggyQSA.push( ":enabled", ":disabled" ); - } - - // Support: Opera 10 - 11 only - // Opera 10-11 does not throw on post-comma invalid pseudos - el.querySelectorAll( "*,:x" ); - rbuggyQSA.push( ",.*:" ); - } ); - } - - if ( ( support.matchesSelector = rnative.test( ( matches = docElem.matches || - docElem.webkitMatchesSelector || - docElem.mozMatchesSelector || - docElem.oMatchesSelector || - docElem.msMatchesSelector ) ) ) ) { - - assert( function( el ) { - - // Check to see if it's possible to do matchesSelector - // on a disconnected node (IE 9) - support.disconnectedMatch = matches.call( el, "*" ); - - // This should fail with an exception - // Gecko does not error, returns false instead - matches.call( el, "[s!='']:x" ); - rbuggyMatches.push( "!=", pseudos ); - } ); - } - - rbuggyQSA = rbuggyQSA.length && new RegExp( rbuggyQSA.join( "|" ) ); - rbuggyMatches = rbuggyMatches.length && new RegExp( rbuggyMatches.join( "|" ) ); - - /* Contains 
- ---------------------------------------------------------------------- */ - hasCompare = rnative.test( docElem.compareDocumentPosition ); - - // Element contains another - // Purposefully self-exclusive - // As in, an element does not contain itself - contains = hasCompare || rnative.test( docElem.contains ) ? - function( a, b ) { - var adown = a.nodeType === 9 ? a.documentElement : a, - bup = b && b.parentNode; - return a === bup || !!( bup && bup.nodeType === 1 && ( - adown.contains ? - adown.contains( bup ) : - a.compareDocumentPosition && a.compareDocumentPosition( bup ) & 16 - ) ); - } : - function( a, b ) { - if ( b ) { - while ( ( b = b.parentNode ) ) { - if ( b === a ) { - return true; - } - } - } - return false; - }; - - /* Sorting - ---------------------------------------------------------------------- */ - - // Document order sorting - sortOrder = hasCompare ? - function( a, b ) { - - // Flag for duplicate removal - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - // Sort on method existence if only one input has compareDocumentPosition - var compare = !a.compareDocumentPosition - !b.compareDocumentPosition; - if ( compare ) { - return compare; - } - - // Calculate position if both inputs belong to the same document - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - compare = ( a.ownerDocument || a ) == ( b.ownerDocument || b ) ? - a.compareDocumentPosition( b ) : - - // Otherwise we know they are disconnected - 1; - - // Disconnected nodes - if ( compare & 1 || - ( !support.sortDetached && b.compareDocumentPosition( a ) === compare ) ) { - - // Choose the first element that is related to our preferred document - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( a == document || a.ownerDocument == preferredDoc && - contains( preferredDoc, a ) ) { - return -1; - } - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( b == document || b.ownerDocument == preferredDoc && - contains( preferredDoc, b ) ) { - return 1; - } - - // Maintain original order - return sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - } - - return compare & 4 ? -1 : 1; - } : - function( a, b ) { - - // Exit early if the nodes are identical - if ( a === b ) { - hasDuplicate = true; - return 0; - } - - var cur, - i = 0, - aup = a.parentNode, - bup = b.parentNode, - ap = [ a ], - bp = [ b ]; - - // Parentless nodes are either documents or disconnected - if ( !aup || !bup ) { - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - /* eslint-disable eqeqeq */ - return a == document ? -1 : - b == document ? 1 : - /* eslint-enable eqeqeq */ - aup ? -1 : - bup ? 1 : - sortInput ? - ( indexOf( sortInput, a ) - indexOf( sortInput, b ) ) : - 0; - - // If the nodes are siblings, we can do a quick check - } else if ( aup === bup ) { - return siblingCheck( a, b ); - } - - // Otherwise we need full lists of their ancestors for comparison - cur = a; - while ( ( cur = cur.parentNode ) ) { - ap.unshift( cur ); - } - cur = b; - while ( ( cur = cur.parentNode ) ) { - bp.unshift( cur ); - } - - // Walk down the tree looking for a discrepancy - while ( ap[ i ] === bp[ i ] ) { - i++; - } - - return i ? 
- - // Do a sibling check if the nodes have a common ancestor - siblingCheck( ap[ i ], bp[ i ] ) : - - // Otherwise nodes in our document sort first - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - /* eslint-disable eqeqeq */ - ap[ i ] == preferredDoc ? -1 : - bp[ i ] == preferredDoc ? 1 : - /* eslint-enable eqeqeq */ - 0; - }; - - return document; -}; - -Sizzle.matches = function( expr, elements ) { - return Sizzle( expr, null, null, elements ); -}; - -Sizzle.matchesSelector = function( elem, expr ) { - setDocument( elem ); - - if ( support.matchesSelector && documentIsHTML && - !nonnativeSelectorCache[ expr + " " ] && - ( !rbuggyMatches || !rbuggyMatches.test( expr ) ) && - ( !rbuggyQSA || !rbuggyQSA.test( expr ) ) ) { - - try { - var ret = matches.call( elem, expr ); - - // IE 9's matchesSelector returns false on disconnected nodes - if ( ret || support.disconnectedMatch || - - // As well, disconnected nodes are said to be in a document - // fragment in IE 9 - elem.document && elem.document.nodeType !== 11 ) { - return ret; - } - } catch ( e ) { - nonnativeSelectorCache( expr, true ); - } - } - - return Sizzle( expr, document, null, [ elem ] ).length > 0; -}; - -Sizzle.contains = function( context, elem ) { - - // Set document vars if needed - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - if ( ( context.ownerDocument || context ) != document ) { - setDocument( context ); - } - return contains( context, elem ); -}; - -Sizzle.attr = function( elem, name ) { - - // Set document vars if needed - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( ( elem.ownerDocument || elem ) != document ) { - setDocument( elem ); - } - - var fn = Expr.attrHandle[ name.toLowerCase() ], - - // Don't get fooled by Object.prototype properties (jQuery #13807) - val = fn && hasOwn.call( Expr.attrHandle, name.toLowerCase() ) ? - fn( elem, name, !documentIsHTML ) : - undefined; - - return val !== undefined ? - val : - support.attributes || !documentIsHTML ? - elem.getAttribute( name ) : - ( val = elem.getAttributeNode( name ) ) && val.specified ? - val.value : - null; -}; - -Sizzle.escape = function( sel ) { - return ( sel + "" ).replace( rcssescape, fcssescape ); -}; - -Sizzle.error = function( msg ) { - throw new Error( "Syntax error, unrecognized expression: " + msg ); -}; - -/** - * Document sorting and removing duplicates - * @param {ArrayLike} results - */ -Sizzle.uniqueSort = function( results ) { - var elem, - duplicates = [], - j = 0, - i = 0; - - // Unless we *know* we can detect duplicates, assume their presence - hasDuplicate = !support.detectDuplicates; - sortInput = !support.sortStable && results.slice( 0 ); - results.sort( sortOrder ); - - if ( hasDuplicate ) { - while ( ( elem = results[ i++ ] ) ) { - if ( elem === results[ i ] ) { - j = duplicates.push( i ); - } - } - while ( j-- ) { - results.splice( duplicates[ j ], 1 ); - } - } - - // Clear input after sorting to release objects - // See https://github.com/jquery/sizzle/pull/225 - sortInput = null; - - return results; -}; - -/** - * Utility function for retrieving the text value of an array of DOM nodes - * @param {Array|Element} elem - */ -getText = Sizzle.getText = function( elem ) { - var node, - ret = "", - i = 0, - nodeType = elem.nodeType; - - if ( !nodeType ) { - - // If no nodeType, this is expected to be an array - while ( ( node = elem[ i++ ] ) ) { - - // Do not traverse comment nodes - ret += getText( node ); - } - } else if ( nodeType === 1 || nodeType === 9 || nodeType === 11 ) { - - // Use textContent 
for elements - // innerText usage removed for consistency of new lines (jQuery #11153) - if ( typeof elem.textContent === "string" ) { - return elem.textContent; - } else { - - // Traverse its children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - ret += getText( elem ); - } - } - } else if ( nodeType === 3 || nodeType === 4 ) { - return elem.nodeValue; - } - - // Do not include comment or processing instruction nodes - - return ret; -}; - -Expr = Sizzle.selectors = { - - // Can be adjusted by the user - cacheLength: 50, - - createPseudo: markFunction, - - match: matchExpr, - - attrHandle: {}, - - find: {}, - - relative: { - ">": { dir: "parentNode", first: true }, - " ": { dir: "parentNode" }, - "+": { dir: "previousSibling", first: true }, - "~": { dir: "previousSibling" } - }, - - preFilter: { - "ATTR": function( match ) { - match[ 1 ] = match[ 1 ].replace( runescape, funescape ); - - // Move the given value to match[3] whether quoted or unquoted - match[ 3 ] = ( match[ 3 ] || match[ 4 ] || - match[ 5 ] || "" ).replace( runescape, funescape ); - - if ( match[ 2 ] === "~=" ) { - match[ 3 ] = " " + match[ 3 ] + " "; - } - - return match.slice( 0, 4 ); - }, - - "CHILD": function( match ) { - - /* matches from matchExpr["CHILD"] - 1 type (only|nth|...) - 2 what (child|of-type) - 3 argument (even|odd|\d*|\d*n([+-]\d+)?|...) - 4 xn-component of xn+y argument ([+-]?\d*n|) - 5 sign of xn-component - 6 x of xn-component - 7 sign of y-component - 8 y of y-component - */ - match[ 1 ] = match[ 1 ].toLowerCase(); - - if ( match[ 1 ].slice( 0, 3 ) === "nth" ) { - - // nth-* requires argument - if ( !match[ 3 ] ) { - Sizzle.error( match[ 0 ] ); - } - - // numeric x and y parameters for Expr.filter.CHILD - // remember that false/true cast respectively to 0/1 - match[ 4 ] = +( match[ 4 ] ? 
- match[ 5 ] + ( match[ 6 ] || 1 ) : - 2 * ( match[ 3 ] === "even" || match[ 3 ] === "odd" ) ); - match[ 5 ] = +( ( match[ 7 ] + match[ 8 ] ) || match[ 3 ] === "odd" ); - - // other types prohibit arguments - } else if ( match[ 3 ] ) { - Sizzle.error( match[ 0 ] ); - } - - return match; - }, - - "PSEUDO": function( match ) { - var excess, - unquoted = !match[ 6 ] && match[ 2 ]; - - if ( matchExpr[ "CHILD" ].test( match[ 0 ] ) ) { - return null; - } - - // Accept quoted arguments as-is - if ( match[ 3 ] ) { - match[ 2 ] = match[ 4 ] || match[ 5 ] || ""; - - // Strip excess characters from unquoted arguments - } else if ( unquoted && rpseudo.test( unquoted ) && - - // Get excess from tokenize (recursively) - ( excess = tokenize( unquoted, true ) ) && - - // advance to the next closing parenthesis - ( excess = unquoted.indexOf( ")", unquoted.length - excess ) - unquoted.length ) ) { - - // excess is a negative index - match[ 0 ] = match[ 0 ].slice( 0, excess ); - match[ 2 ] = unquoted.slice( 0, excess ); - } - - // Return only captures needed by the pseudo filter method (type and argument) - return match.slice( 0, 3 ); - } - }, - - filter: { - - "TAG": function( nodeNameSelector ) { - var nodeName = nodeNameSelector.replace( runescape, funescape ).toLowerCase(); - return nodeNameSelector === "*" ? 
- function() { - return true; - } : - function( elem ) { - return elem.nodeName && elem.nodeName.toLowerCase() === nodeName; - }; - }, - - "CLASS": function( className ) { - var pattern = classCache[ className + " " ]; - - return pattern || - ( pattern = new RegExp( "(^|" + whitespace + - ")" + className + "(" + whitespace + "|$)" ) ) && classCache( - className, function( elem ) { - return pattern.test( - typeof elem.className === "string" && elem.className || - typeof elem.getAttribute !== "undefined" && - elem.getAttribute( "class" ) || - "" - ); - } ); - }, - - "ATTR": function( name, operator, check ) { - return function( elem ) { - var result = Sizzle.attr( elem, name ); - - if ( result == null ) { - return operator === "!="; - } - if ( !operator ) { - return true; - } - - result += ""; - - /* eslint-disable max-len */ - - return operator === "=" ? result === check : - operator === "!=" ? result !== check : - operator === "^=" ? check && result.indexOf( check ) === 0 : - operator === "*=" ? check && result.indexOf( check ) > -1 : - operator === "$=" ? check && result.slice( -check.length ) === check : - operator === "~=" ? ( " " + result.replace( rwhitespace, " " ) + " " ).indexOf( check ) > -1 : - operator === "|=" ? result === check || result.slice( 0, check.length + 1 ) === check + "-" : - false; - /* eslint-enable max-len */ - - }; - }, - - "CHILD": function( type, what, _argument, first, last ) { - var simple = type.slice( 0, 3 ) !== "nth", - forward = type.slice( -4 ) !== "last", - ofType = what === "of-type"; - - return first === 1 && last === 0 ? - - // Shortcut for :nth-*(n) - function( elem ) { - return !!elem.parentNode; - } : - - function( elem, _context, xml ) { - var cache, uniqueCache, outerCache, node, nodeIndex, start, - dir = simple !== forward ? 
"nextSibling" : "previousSibling", - parent = elem.parentNode, - name = ofType && elem.nodeName.toLowerCase(), - useCache = !xml && !ofType, - diff = false; - - if ( parent ) { - - // :(first|last|only)-(child|of-type) - if ( simple ) { - while ( dir ) { - node = elem; - while ( ( node = node[ dir ] ) ) { - if ( ofType ? - node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) { - - return false; - } - } - - // Reverse direction for :only-* (if we haven't yet done so) - start = dir = type === "only" && !start && "nextSibling"; - } - return true; - } - - start = [ forward ? parent.firstChild : parent.lastChild ]; - - // non-xml :nth-child(...) stores cache data on `parent` - if ( forward && useCache ) { - - // Seek `elem` from a previously-cached index - - // ...in a gzip-friendly way - node = parent; - outerCache = node[ expando ] || ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 ] === dirruns && cache[ 1 ]; - diff = nodeIndex && cache[ 2 ]; - node = nodeIndex && parent.childNodes[ nodeIndex ]; - - while ( ( node = ++nodeIndex && node && node[ dir ] || - - // Fallback to seeking `elem` from the start - ( diff = nodeIndex = 0 ) || start.pop() ) ) { - - // When found, cache indexes on `parent` and break - if ( node.nodeType === 1 && ++diff && node === elem ) { - uniqueCache[ type ] = [ dirruns, nodeIndex, diff ]; - break; - } - } - - } else { - - // Use previously-cached element index if available - if ( useCache ) { - - // ...in a gzip-friendly way - node = elem; - outerCache = node[ expando ] || ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - cache = uniqueCache[ type ] || []; - nodeIndex = cache[ 0 
] === dirruns && cache[ 1 ]; - diff = nodeIndex; - } - - // xml :nth-child(...) - // or :nth-last-child(...) or :nth(-last)?-of-type(...) - if ( diff === false ) { - - // Use the same loop as above to seek `elem` from the start - while ( ( node = ++nodeIndex && node && node[ dir ] || - ( diff = nodeIndex = 0 ) || start.pop() ) ) { - - if ( ( ofType ? - node.nodeName.toLowerCase() === name : - node.nodeType === 1 ) && - ++diff ) { - - // Cache the index of each encountered element - if ( useCache ) { - outerCache = node[ expando ] || - ( node[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ node.uniqueID ] || - ( outerCache[ node.uniqueID ] = {} ); - - uniqueCache[ type ] = [ dirruns, diff ]; - } - - if ( node === elem ) { - break; - } - } - } - } - } - - // Incorporate the offset, then check against cycle size - diff -= last; - return diff === first || ( diff % first === 0 && diff / first >= 0 ); - } - }; - }, - - "PSEUDO": function( pseudo, argument ) { - - // pseudo-class names are case-insensitive - // http://www.w3.org/TR/selectors/#pseudo-classes - // Prioritize by case sensitivity in case custom pseudos are added with uppercase letters - // Remember that setFilters inherits from pseudos - var args, - fn = Expr.pseudos[ pseudo ] || Expr.setFilters[ pseudo.toLowerCase() ] || - Sizzle.error( "unsupported pseudo: " + pseudo ); - - // The user may use createPseudo to indicate that - // arguments are needed to create the filter function - // just as Sizzle does - if ( fn[ expando ] ) { - return fn( argument ); - } - - // But maintain support for old signatures - if ( fn.length > 1 ) { - args = [ pseudo, pseudo, "", argument ]; - return Expr.setFilters.hasOwnProperty( pseudo.toLowerCase() ) ? 
- markFunction( function( seed, matches ) { - var idx, - matched = fn( seed, argument ), - i = matched.length; - while ( i-- ) { - idx = indexOf( seed, matched[ i ] ); - seed[ idx ] = !( matches[ idx ] = matched[ i ] ); - } - } ) : - function( elem ) { - return fn( elem, 0, args ); - }; - } - - return fn; - } - }, - - pseudos: { - - // Potentially complex pseudos - "not": markFunction( function( selector ) { - - // Trim the selector passed to compile - // to avoid treating leading and trailing - // spaces as combinators - var input = [], - results = [], - matcher = compile( selector.replace( rtrim, "$1" ) ); - - return matcher[ expando ] ? - markFunction( function( seed, matches, _context, xml ) { - var elem, - unmatched = matcher( seed, null, xml, [] ), - i = seed.length; - - // Match elements unmatched by `matcher` - while ( i-- ) { - if ( ( elem = unmatched[ i ] ) ) { - seed[ i ] = !( matches[ i ] = elem ); - } - } - } ) : - function( elem, _context, xml ) { - input[ 0 ] = elem; - matcher( input, null, xml, results ); - - // Don't keep the element (issue #299) - input[ 0 ] = null; - return !results.pop(); - }; - } ), - - "has": markFunction( function( selector ) { - return function( elem ) { - return Sizzle( selector, elem ).length > 0; - }; - } ), - - "contains": markFunction( function( text ) { - text = text.replace( runescape, funescape ); - return function( elem ) { - return ( elem.textContent || getText( elem ) ).indexOf( text ) > -1; - }; - } ), - - // "Whether an element is represented by a :lang() selector - // is based solely on the element's language value - // being equal to the identifier C, - // or beginning with the identifier C immediately followed by "-". - // The matching of C against the element's language value is performed case-insensitively. - // The identifier C does not have to be a valid language name." 
- // http://www.w3.org/TR/selectors/#lang-pseudo - "lang": markFunction( function( lang ) { - - // lang value must be a valid identifier - if ( !ridentifier.test( lang || "" ) ) { - Sizzle.error( "unsupported lang: " + lang ); - } - lang = lang.replace( runescape, funescape ).toLowerCase(); - return function( elem ) { - var elemLang; - do { - if ( ( elemLang = documentIsHTML ? - elem.lang : - elem.getAttribute( "xml:lang" ) || elem.getAttribute( "lang" ) ) ) { - - elemLang = elemLang.toLowerCase(); - return elemLang === lang || elemLang.indexOf( lang + "-" ) === 0; - } - } while ( ( elem = elem.parentNode ) && elem.nodeType === 1 ); - return false; - }; - } ), - - // Miscellaneous - "target": function( elem ) { - var hash = window.location && window.location.hash; - return hash && hash.slice( 1 ) === elem.id; - }, - - "root": function( elem ) { - return elem === docElem; - }, - - "focus": function( elem ) { - return elem === document.activeElement && - ( !document.hasFocus || document.hasFocus() ) && - !!( elem.type || elem.href || ~elem.tabIndex ); - }, - - // Boolean properties - "enabled": createDisabledPseudo( false ), - "disabled": createDisabledPseudo( true ), - - "checked": function( elem ) { - - // In CSS3, :checked should return both checked and selected elements - // http://www.w3.org/TR/2011/REC-css3-selectors-20110929/#checked - var nodeName = elem.nodeName.toLowerCase(); - return ( nodeName === "input" && !!elem.checked ) || - ( nodeName === "option" && !!elem.selected ); - }, - - "selected": function( elem ) { - - // Accessing this property makes selected-by-default - // options in Safari work properly - if ( elem.parentNode ) { - // eslint-disable-next-line no-unused-expressions - elem.parentNode.selectedIndex; - } - - return elem.selected === true; - }, - - // Contents - "empty": function( elem ) { - - // http://www.w3.org/TR/selectors/#empty-pseudo - // :empty is negated by element (1) or content nodes (text: 3; cdata: 4; entity ref: 5), - // but 
not by others (comment: 8; processing instruction: 7; etc.) - // nodeType < 6 works because attributes (2) do not appear as children - for ( elem = elem.firstChild; elem; elem = elem.nextSibling ) { - if ( elem.nodeType < 6 ) { - return false; - } - } - return true; - }, - - "parent": function( elem ) { - return !Expr.pseudos[ "empty" ]( elem ); - }, - - // Element/input types - "header": function( elem ) { - return rheader.test( elem.nodeName ); - }, - - "input": function( elem ) { - return rinputs.test( elem.nodeName ); - }, - - "button": function( elem ) { - var name = elem.nodeName.toLowerCase(); - return name === "input" && elem.type === "button" || name === "button"; - }, - - "text": function( elem ) { - var attr; - return elem.nodeName.toLowerCase() === "input" && - elem.type === "text" && - - // Support: IE<8 - // New HTML5 attribute values (e.g., "search") appear with elem.type === "text" - ( ( attr = elem.getAttribute( "type" ) ) == null || - attr.toLowerCase() === "text" ); - }, - - // Position-in-collection - "first": createPositionalPseudo( function() { - return [ 0 ]; - } ), - - "last": createPositionalPseudo( function( _matchIndexes, length ) { - return [ length - 1 ]; - } ), - - "eq": createPositionalPseudo( function( _matchIndexes, length, argument ) { - return [ argument < 0 ? argument + length : argument ]; - } ), - - "even": createPositionalPseudo( function( matchIndexes, length ) { - var i = 0; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "odd": createPositionalPseudo( function( matchIndexes, length ) { - var i = 1; - for ( ; i < length; i += 2 ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "lt": createPositionalPseudo( function( matchIndexes, length, argument ) { - var i = argument < 0 ? - argument + length : - argument > length ? 
- length : - argument; - for ( ; --i >= 0; ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ), - - "gt": createPositionalPseudo( function( matchIndexes, length, argument ) { - var i = argument < 0 ? argument + length : argument; - for ( ; ++i < length; ) { - matchIndexes.push( i ); - } - return matchIndexes; - } ) - } -}; - -Expr.pseudos[ "nth" ] = Expr.pseudos[ "eq" ]; - -// Add button/input type pseudos -for ( i in { radio: true, checkbox: true, file: true, password: true, image: true } ) { - Expr.pseudos[ i ] = createInputPseudo( i ); -} -for ( i in { submit: true, reset: true } ) { - Expr.pseudos[ i ] = createButtonPseudo( i ); -} - -// Easy API for creating new setFilters -function setFilters() {} -setFilters.prototype = Expr.filters = Expr.pseudos; -Expr.setFilters = new setFilters(); - -tokenize = Sizzle.tokenize = function( selector, parseOnly ) { - var matched, match, tokens, type, - soFar, groups, preFilters, - cached = tokenCache[ selector + " " ]; - - if ( cached ) { - return parseOnly ? 
0 : cached.slice( 0 ); - } - - soFar = selector; - groups = []; - preFilters = Expr.preFilter; - - while ( soFar ) { - - // Comma and first run - if ( !matched || ( match = rcomma.exec( soFar ) ) ) { - if ( match ) { - - // Don't consume trailing commas as valid - soFar = soFar.slice( match[ 0 ].length ) || soFar; - } - groups.push( ( tokens = [] ) ); - } - - matched = false; - - // Combinators - if ( ( match = rcombinators.exec( soFar ) ) ) { - matched = match.shift(); - tokens.push( { - value: matched, - - // Cast descendant combinators to space - type: match[ 0 ].replace( rtrim, " " ) - } ); - soFar = soFar.slice( matched.length ); - } - - // Filters - for ( type in Expr.filter ) { - if ( ( match = matchExpr[ type ].exec( soFar ) ) && ( !preFilters[ type ] || - ( match = preFilters[ type ]( match ) ) ) ) { - matched = match.shift(); - tokens.push( { - value: matched, - type: type, - matches: match - } ); - soFar = soFar.slice( matched.length ); - } - } - - if ( !matched ) { - break; - } - } - - // Return the length of the invalid excess - // if we're just parsing - // Otherwise, throw an error or return tokens - return parseOnly ? - soFar.length : - soFar ? - Sizzle.error( selector ) : - - // Cache the tokens - tokenCache( selector, groups ).slice( 0 ); -}; - -function toSelector( tokens ) { - var i = 0, - len = tokens.length, - selector = ""; - for ( ; i < len; i++ ) { - selector += tokens[ i ].value; - } - return selector; -} - -function addCombinator( matcher, combinator, base ) { - var dir = combinator.dir, - skip = combinator.next, - key = skip || dir, - checkNonElements = base && key === "parentNode", - doneName = done++; - - return combinator.first ? 
- - // Check against closest ancestor/preceding element - function( elem, context, xml ) { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - return matcher( elem, context, xml ); - } - } - return false; - } : - - // Check against all ancestor/preceding elements - function( elem, context, xml ) { - var oldCache, uniqueCache, outerCache, - newCache = [ dirruns, doneName ]; - - // We can't set arbitrary data on XML nodes, so they don't benefit from combinator caching - if ( xml ) { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - if ( matcher( elem, context, xml ) ) { - return true; - } - } - } - } else { - while ( ( elem = elem[ dir ] ) ) { - if ( elem.nodeType === 1 || checkNonElements ) { - outerCache = elem[ expando ] || ( elem[ expando ] = {} ); - - // Support: IE <9 only - // Defend against cloned attroperties (jQuery gh-1709) - uniqueCache = outerCache[ elem.uniqueID ] || - ( outerCache[ elem.uniqueID ] = {} ); - - if ( skip && skip === elem.nodeName.toLowerCase() ) { - elem = elem[ dir ] || elem; - } else if ( ( oldCache = uniqueCache[ key ] ) && - oldCache[ 0 ] === dirruns && oldCache[ 1 ] === doneName ) { - - // Assign to newCache so results back-propagate to previous elements - return ( newCache[ 2 ] = oldCache[ 2 ] ); - } else { - - // Reuse newcache so results back-propagate to previous elements - uniqueCache[ key ] = newCache; - - // A match means we're done; a fail means we have to keep checking - if ( ( newCache[ 2 ] = matcher( elem, context, xml ) ) ) { - return true; - } - } - } - } - } - return false; - }; -} - -function elementMatcher( matchers ) { - return matchers.length > 1 ? 
- function( elem, context, xml ) { - var i = matchers.length; - while ( i-- ) { - if ( !matchers[ i ]( elem, context, xml ) ) { - return false; - } - } - return true; - } : - matchers[ 0 ]; -} - -function multipleContexts( selector, contexts, results ) { - var i = 0, - len = contexts.length; - for ( ; i < len; i++ ) { - Sizzle( selector, contexts[ i ], results ); - } - return results; -} - -function condense( unmatched, map, filter, context, xml ) { - var elem, - newUnmatched = [], - i = 0, - len = unmatched.length, - mapped = map != null; - - for ( ; i < len; i++ ) { - if ( ( elem = unmatched[ i ] ) ) { - if ( !filter || filter( elem, context, xml ) ) { - newUnmatched.push( elem ); - if ( mapped ) { - map.push( i ); - } - } - } - } - - return newUnmatched; -} - -function setMatcher( preFilter, selector, matcher, postFilter, postFinder, postSelector ) { - if ( postFilter && !postFilter[ expando ] ) { - postFilter = setMatcher( postFilter ); - } - if ( postFinder && !postFinder[ expando ] ) { - postFinder = setMatcher( postFinder, postSelector ); - } - return markFunction( function( seed, results, context, xml ) { - var temp, i, elem, - preMap = [], - postMap = [], - preexisting = results.length, - - // Get initial elements from seed or context - elems = seed || multipleContexts( - selector || "*", - context.nodeType ? [ context ] : context, - [] - ), - - // Prefilter to get matcher input, preserving a map for seed-results synchronization - matcherIn = preFilter && ( seed || !selector ) ? - condense( elems, preMap, preFilter, context, xml ) : - elems, - - matcherOut = matcher ? - - // If we have a postFinder, or filtered seed, or non-seed postFilter or preexisting results, - postFinder || ( seed ? preFilter : preexisting || postFilter ) ? 
- - // ...intermediate processing is necessary - [] : - - // ...otherwise use results directly - results : - matcherIn; - - // Find primary matches - if ( matcher ) { - matcher( matcherIn, matcherOut, context, xml ); - } - - // Apply postFilter - if ( postFilter ) { - temp = condense( matcherOut, postMap ); - postFilter( temp, [], context, xml ); - - // Un-match failing elements by moving them back to matcherIn - i = temp.length; - while ( i-- ) { - if ( ( elem = temp[ i ] ) ) { - matcherOut[ postMap[ i ] ] = !( matcherIn[ postMap[ i ] ] = elem ); - } - } - } - - if ( seed ) { - if ( postFinder || preFilter ) { - if ( postFinder ) { - - // Get the final matcherOut by condensing this intermediate into postFinder contexts - temp = []; - i = matcherOut.length; - while ( i-- ) { - if ( ( elem = matcherOut[ i ] ) ) { - - // Restore matcherIn since elem is not yet a final match - temp.push( ( matcherIn[ i ] = elem ) ); - } - } - postFinder( null, ( matcherOut = [] ), temp, xml ); - } - - // Move matched elements from seed to results to keep them synchronized - i = matcherOut.length; - while ( i-- ) { - if ( ( elem = matcherOut[ i ] ) && - ( temp = postFinder ? indexOf( seed, elem ) : preMap[ i ] ) > -1 ) { - - seed[ temp ] = !( results[ temp ] = elem ); - } - } - } - - // Add elements to results, through postFinder if defined - } else { - matcherOut = condense( - matcherOut === results ? - matcherOut.splice( preexisting, matcherOut.length ) : - matcherOut - ); - if ( postFinder ) { - postFinder( null, results, matcherOut, xml ); - } else { - push.apply( results, matcherOut ); - } - } - } ); -} - -function matcherFromTokens( tokens ) { - var checkContext, matcher, j, - len = tokens.length, - leadingRelative = Expr.relative[ tokens[ 0 ].type ], - implicitRelative = leadingRelative || Expr.relative[ " " ], - i = leadingRelative ? 
1 : 0, - - // The foundational matcher ensures that elements are reachable from top-level context(s) - matchContext = addCombinator( function( elem ) { - return elem === checkContext; - }, implicitRelative, true ), - matchAnyContext = addCombinator( function( elem ) { - return indexOf( checkContext, elem ) > -1; - }, implicitRelative, true ), - matchers = [ function( elem, context, xml ) { - var ret = ( !leadingRelative && ( xml || context !== outermostContext ) ) || ( - ( checkContext = context ).nodeType ? - matchContext( elem, context, xml ) : - matchAnyContext( elem, context, xml ) ); - - // Avoid hanging onto element (issue #299) - checkContext = null; - return ret; - } ]; - - for ( ; i < len; i++ ) { - if ( ( matcher = Expr.relative[ tokens[ i ].type ] ) ) { - matchers = [ addCombinator( elementMatcher( matchers ), matcher ) ]; - } else { - matcher = Expr.filter[ tokens[ i ].type ].apply( null, tokens[ i ].matches ); - - // Return special upon seeing a positional matcher - if ( matcher[ expando ] ) { - - // Find the next relative operator (if any) for proper handling - j = ++i; - for ( ; j < len; j++ ) { - if ( Expr.relative[ tokens[ j ].type ] ) { - break; - } - } - return setMatcher( - i > 1 && elementMatcher( matchers ), - i > 1 && toSelector( - - // If the preceding token was a descendant combinator, insert an implicit any-element `*` - tokens - .slice( 0, i - 1 ) - .concat( { value: tokens[ i - 2 ].type === " " ? 
"*" : "" } ) - ).replace( rtrim, "$1" ), - matcher, - i < j && matcherFromTokens( tokens.slice( i, j ) ), - j < len && matcherFromTokens( ( tokens = tokens.slice( j ) ) ), - j < len && toSelector( tokens ) - ); - } - matchers.push( matcher ); - } - } - - return elementMatcher( matchers ); -} - -function matcherFromGroupMatchers( elementMatchers, setMatchers ) { - var bySet = setMatchers.length > 0, - byElement = elementMatchers.length > 0, - superMatcher = function( seed, context, xml, results, outermost ) { - var elem, j, matcher, - matchedCount = 0, - i = "0", - unmatched = seed && [], - setMatched = [], - contextBackup = outermostContext, - - // We must always have either seed elements or outermost context - elems = seed || byElement && Expr.find[ "TAG" ]( "*", outermost ), - - // Use integer dirruns iff this is the outermost matcher - dirrunsUnique = ( dirruns += contextBackup == null ? 1 : Math.random() || 0.1 ), - len = elems.length; - - if ( outermost ) { - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. - // eslint-disable-next-line eqeqeq - outermostContext = context == document || context || outermost; - } - - // Add elements passing elementMatchers directly to results - // Support: IE<9, Safari - // Tolerate NodeList properties (IE: "length"; Safari: ) matching elements by id - for ( ; i !== len && ( elem = elems[ i ] ) != null; i++ ) { - if ( byElement && elem ) { - j = 0; - - // Support: IE 11+, Edge 17 - 18+ - // IE/Edge sometimes throw a "Permission denied" error when strict-comparing - // two documents; shallow comparisons work. 
- // eslint-disable-next-line eqeqeq - if ( !context && elem.ownerDocument != document ) { - setDocument( elem ); - xml = !documentIsHTML; - } - while ( ( matcher = elementMatchers[ j++ ] ) ) { - if ( matcher( elem, context || document, xml ) ) { - results.push( elem ); - break; - } - } - if ( outermost ) { - dirruns = dirrunsUnique; - } - } - - // Track unmatched elements for set filters - if ( bySet ) { - - // They will have gone through all possible matchers - if ( ( elem = !matcher && elem ) ) { - matchedCount--; - } - - // Lengthen the array for every element, matched or not - if ( seed ) { - unmatched.push( elem ); - } - } - } - - // `i` is now the count of elements visited above, and adding it to `matchedCount` - // makes the latter nonnegative. - matchedCount += i; - - // Apply set filters to unmatched elements - // NOTE: This can be skipped if there are no unmatched elements (i.e., `matchedCount` - // equals `i`), unless we didn't visit _any_ elements in the above loop because we have - // no element matchers and no seed. - // Incrementing an initially-string "0" `i` allows `i` to remain a string only in that - // case, which will result in a "00" `matchedCount` that differs from `i` but is also - // numerically zero. 
- if ( bySet && i !== matchedCount ) { - j = 0; - while ( ( matcher = setMatchers[ j++ ] ) ) { - matcher( unmatched, setMatched, context, xml ); - } - - if ( seed ) { - - // Reintegrate element matches to eliminate the need for sorting - if ( matchedCount > 0 ) { - while ( i-- ) { - if ( !( unmatched[ i ] || setMatched[ i ] ) ) { - setMatched[ i ] = pop.call( results ); - } - } - } - - // Discard index placeholder values to get only actual matches - setMatched = condense( setMatched ); - } - - // Add matches to results - push.apply( results, setMatched ); - - // Seedless set matches succeeding multiple successful matchers stipulate sorting - if ( outermost && !seed && setMatched.length > 0 && - ( matchedCount + setMatchers.length ) > 1 ) { - - Sizzle.uniqueSort( results ); - } - } - - // Override manipulation of globals by nested matchers - if ( outermost ) { - dirruns = dirrunsUnique; - outermostContext = contextBackup; - } - - return unmatched; - }; - - return bySet ? - markFunction( superMatcher ) : - superMatcher; -} - -compile = Sizzle.compile = function( selector, match /* Internal Use Only */ ) { - var i, - setMatchers = [], - elementMatchers = [], - cached = compilerCache[ selector + " " ]; - - if ( !cached ) { - - // Generate a function of recursive functions that can be used to check each element - if ( !match ) { - match = tokenize( selector ); - } - i = match.length; - while ( i-- ) { - cached = matcherFromTokens( match[ i ] ); - if ( cached[ expando ] ) { - setMatchers.push( cached ); - } else { - elementMatchers.push( cached ); - } - } - - // Cache the compiled function - cached = compilerCache( - selector, - matcherFromGroupMatchers( elementMatchers, setMatchers ) - ); - - // Save selector and tokenization - cached.selector = selector; - } - return cached; -}; - -/** - * A low-level selection function that works with Sizzle's compiled - * selector functions - * @param {String|Function} selector A selector or a pre-compiled - * selector function built 
with Sizzle.compile - * @param {Element} context - * @param {Array} [results] - * @param {Array} [seed] A set of elements to match against - */ -select = Sizzle.select = function( selector, context, results, seed ) { - var i, tokens, token, type, find, - compiled = typeof selector === "function" && selector, - match = !seed && tokenize( ( selector = compiled.selector || selector ) ); - - results = results || []; - - // Try to minimize operations if there is only one selector in the list and no seed - // (the latter of which guarantees us context) - if ( match.length === 1 ) { - - // Reduce context if the leading compound selector is an ID - tokens = match[ 0 ] = match[ 0 ].slice( 0 ); - if ( tokens.length > 2 && ( token = tokens[ 0 ] ).type === "ID" && - context.nodeType === 9 && documentIsHTML && Expr.relative[ tokens[ 1 ].type ] ) { - - context = ( Expr.find[ "ID" ]( token.matches[ 0 ] - .replace( runescape, funescape ), context ) || [] )[ 0 ]; - if ( !context ) { - return results; - - // Precompiled matchers will still verify ancestry, so step up a level - } else if ( compiled ) { - context = context.parentNode; - } - - selector = selector.slice( tokens.shift().value.length ); - } - - // Fetch a seed set for right-to-left matching - i = matchExpr[ "needsContext" ].test( selector ) ? 
0 : tokens.length; - while ( i-- ) { - token = tokens[ i ]; - - // Abort if we hit a combinator - if ( Expr.relative[ ( type = token.type ) ] ) { - break; - } - if ( ( find = Expr.find[ type ] ) ) { - - // Search, expanding context for leading sibling combinators - if ( ( seed = find( - token.matches[ 0 ].replace( runescape, funescape ), - rsibling.test( tokens[ 0 ].type ) && testContext( context.parentNode ) || - context - ) ) ) { - - // If seed is empty or no tokens remain, we can return early - tokens.splice( i, 1 ); - selector = seed.length && toSelector( tokens ); - if ( !selector ) { - push.apply( results, seed ); - return results; - } - - break; - } - } - } - } - - // Compile and execute a filtering function if one is not provided - // Provide `match` to avoid retokenization if we modified the selector above - ( compiled || compile( selector, match ) )( - seed, - context, - !documentIsHTML, - results, - !context || rsibling.test( selector ) && testContext( context.parentNode ) || context - ); - return results; -}; - -// One-time assignments - -// Sort stability -support.sortStable = expando.split( "" ).sort( sortOrder ).join( "" ) === expando; - -// Support: Chrome 14-35+ -// Always assume duplicates if they aren't passed to the comparison function -support.detectDuplicates = !!hasDuplicate; - -// Initialize against the default document -setDocument(); - -// Support: Webkit<537.32 - Safari 6.0.3/Chrome 25 (fixed in Chrome 27) -// Detached nodes confoundingly follow *each other* -support.sortDetached = assert( function( el ) { - - // Should return 1, but returns 4 (following) - return el.compareDocumentPosition( document.createElement( "fieldset" ) ) & 1; -} ); - -// Support: IE<8 -// Prevent attribute/property "interpolation" -// https://msdn.microsoft.com/en-us/library/ms536429%28VS.85%29.aspx -if ( !assert( function( el ) { - el.innerHTML = ""; - return el.firstChild.getAttribute( "href" ) === "#"; -} ) ) { - addHandle( "type|href|height|width", function( 
elem, name, isXML ) { - if ( !isXML ) { - return elem.getAttribute( name, name.toLowerCase() === "type" ? 1 : 2 ); - } - } ); -} - -// Support: IE<9 -// Use defaultValue in place of getAttribute("value") -if ( !support.attributes || !assert( function( el ) { - el.innerHTML = ""; - el.firstChild.setAttribute( "value", "" ); - return el.firstChild.getAttribute( "value" ) === ""; -} ) ) { - addHandle( "value", function( elem, _name, isXML ) { - if ( !isXML && elem.nodeName.toLowerCase() === "input" ) { - return elem.defaultValue; - } - } ); -} - -// Support: IE<9 -// Use getAttributeNode to fetch booleans when getAttribute lies -if ( !assert( function( el ) { - return el.getAttribute( "disabled" ) == null; -} ) ) { - addHandle( booleans, function( elem, name, isXML ) { - var val; - if ( !isXML ) { - return elem[ name ] === true ? name.toLowerCase() : - ( val = elem.getAttributeNode( name ) ) && val.specified ? - val.value : - null; - } - } ); -} - -return Sizzle; - -} )( window ); - - - -jQuery.find = Sizzle; -jQuery.expr = Sizzle.selectors; - -// Deprecated -jQuery.expr[ ":" ] = jQuery.expr.pseudos; -jQuery.uniqueSort = jQuery.unique = Sizzle.uniqueSort; -jQuery.text = Sizzle.getText; -jQuery.isXMLDoc = Sizzle.isXML; -jQuery.contains = Sizzle.contains; -jQuery.escapeSelector = Sizzle.escape; - - - - -var dir = function( elem, dir, until ) { - var matched = [], - truncate = until !== undefined; - - while ( ( elem = elem[ dir ] ) && elem.nodeType !== 9 ) { - if ( elem.nodeType === 1 ) { - if ( truncate && jQuery( elem ).is( until ) ) { - break; - } - matched.push( elem ); - } - } - return matched; -}; - - -var siblings = function( n, elem ) { - var matched = []; - - for ( ; n; n = n.nextSibling ) { - if ( n.nodeType === 1 && n !== elem ) { - matched.push( n ); - } - } - - return matched; -}; - - -var rneedsContext = jQuery.expr.match.needsContext; - - - -function nodeName( elem, name ) { - - return elem.nodeName && elem.nodeName.toLowerCase() === name.toLowerCase(); - 
-}; -var rsingleTag = ( /^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i ); - - - -// Implement the identical functionality for filter and not -function winnow( elements, qualifier, not ) { - if ( isFunction( qualifier ) ) { - return jQuery.grep( elements, function( elem, i ) { - return !!qualifier.call( elem, i, elem ) !== not; - } ); - } - - // Single element - if ( qualifier.nodeType ) { - return jQuery.grep( elements, function( elem ) { - return ( elem === qualifier ) !== not; - } ); - } - - // Arraylike of elements (jQuery, arguments, Array) - if ( typeof qualifier !== "string" ) { - return jQuery.grep( elements, function( elem ) { - return ( indexOf.call( qualifier, elem ) > -1 ) !== not; - } ); - } - - // Filtered directly for both simple and complex selectors - return jQuery.filter( qualifier, elements, not ); -} - -jQuery.filter = function( expr, elems, not ) { - var elem = elems[ 0 ]; - - if ( not ) { - expr = ":not(" + expr + ")"; - } - - if ( elems.length === 1 && elem.nodeType === 1 ) { - return jQuery.find.matchesSelector( elem, expr ) ? [ elem ] : []; - } - - return jQuery.find.matches( expr, jQuery.grep( elems, function( elem ) { - return elem.nodeType === 1; - } ) ); -}; - -jQuery.fn.extend( { - find: function( selector ) { - var i, ret, - len = this.length, - self = this; - - if ( typeof selector !== "string" ) { - return this.pushStack( jQuery( selector ).filter( function() { - for ( i = 0; i < len; i++ ) { - if ( jQuery.contains( self[ i ], this ) ) { - return true; - } - } - } ) ); - } - - ret = this.pushStack( [] ); - - for ( i = 0; i < len; i++ ) { - jQuery.find( selector, self[ i ], ret ); - } - - return len > 1 ? 
jQuery.uniqueSort( ret ) : ret; - }, - filter: function( selector ) { - return this.pushStack( winnow( this, selector || [], false ) ); - }, - not: function( selector ) { - return this.pushStack( winnow( this, selector || [], true ) ); - }, - is: function( selector ) { - return !!winnow( - this, - - // If this is a positional/relative selector, check membership in the returned set - // so $("p:first").is("p:last") won't return true for a doc with two "p". - typeof selector === "string" && rneedsContext.test( selector ) ? - jQuery( selector ) : - selector || [], - false - ).length; - } -} ); - - -// Initialize a jQuery object - - -// A central reference to the root jQuery(document) -var rootjQuery, - - // A simple way to check for HTML strings - // Prioritize #id over to avoid XSS via location.hash (#9521) - // Strict HTML recognition (#11290: must start with <) - // Shortcut simple #id case for speed - rquickExpr = /^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/, - - init = jQuery.fn.init = function( selector, context, root ) { - var match, elem; - - // HANDLE: $(""), $(null), $(undefined), $(false) - if ( !selector ) { - return this; - } - - // Method init() accepts an alternate rootjQuery - // so migrate can support jQuery.sub (gh-2101) - root = root || rootjQuery; - - // Handle HTML strings - if ( typeof selector === "string" ) { - if ( selector[ 0 ] === "<" && - selector[ selector.length - 1 ] === ">" && - selector.length >= 3 ) { - - // Assume that strings that start and end with <> are HTML and skip the regex check - match = [ null, selector, null ]; - - } else { - match = rquickExpr.exec( selector ); - } - - // Match html or make sure no context is specified for #id - if ( match && ( match[ 1 ] || !context ) ) { - - // HANDLE: $(html) -> $(array) - if ( match[ 1 ] ) { - context = context instanceof jQuery ? 
context[ 0 ] : context; - - // Option to run scripts is true for back-compat - // Intentionally let the error be thrown if parseHTML is not present - jQuery.merge( this, jQuery.parseHTML( - match[ 1 ], - context && context.nodeType ? context.ownerDocument || context : document, - true - ) ); - - // HANDLE: $(html, props) - if ( rsingleTag.test( match[ 1 ] ) && jQuery.isPlainObject( context ) ) { - for ( match in context ) { - - // Properties of context are called as methods if possible - if ( isFunction( this[ match ] ) ) { - this[ match ]( context[ match ] ); - - // ...and otherwise set as attributes - } else { - this.attr( match, context[ match ] ); - } - } - } - - return this; - - // HANDLE: $(#id) - } else { - elem = document.getElementById( match[ 2 ] ); - - if ( elem ) { - - // Inject the element directly into the jQuery object - this[ 0 ] = elem; - this.length = 1; - } - return this; - } - - // HANDLE: $(expr, $(...)) - } else if ( !context || context.jquery ) { - return ( context || root ).find( selector ); - - // HANDLE: $(expr, context) - // (which is just equivalent to: $(context).find(expr) - } else { - return this.constructor( context ).find( selector ); - } - - // HANDLE: $(DOMElement) - } else if ( selector.nodeType ) { - this[ 0 ] = selector; - this.length = 1; - return this; - - // HANDLE: $(function) - // Shortcut for document ready - } else if ( isFunction( selector ) ) { - return root.ready !== undefined ? 
- root.ready( selector ) : - - // Execute immediately if ready is not present - selector( jQuery ); - } - - return jQuery.makeArray( selector, this ); - }; - -// Give the init function the jQuery prototype for later instantiation -init.prototype = jQuery.fn; - -// Initialize central reference -rootjQuery = jQuery( document ); - - -var rparentsprev = /^(?:parents|prev(?:Until|All))/, - - // Methods guaranteed to produce a unique set when starting from a unique set - guaranteedUnique = { - children: true, - contents: true, - next: true, - prev: true - }; - -jQuery.fn.extend( { - has: function( target ) { - var targets = jQuery( target, this ), - l = targets.length; - - return this.filter( function() { - var i = 0; - for ( ; i < l; i++ ) { - if ( jQuery.contains( this, targets[ i ] ) ) { - return true; - } - } - } ); - }, - - closest: function( selectors, context ) { - var cur, - i = 0, - l = this.length, - matched = [], - targets = typeof selectors !== "string" && jQuery( selectors ); - - // Positional selectors never match, since there's no _selection_ context - if ( !rneedsContext.test( selectors ) ) { - for ( ; i < l; i++ ) { - for ( cur = this[ i ]; cur && cur !== context; cur = cur.parentNode ) { - - // Always skip document fragments - if ( cur.nodeType < 11 && ( targets ? - targets.index( cur ) > -1 : - - // Don't pass non-elements to Sizzle - cur.nodeType === 1 && - jQuery.find.matchesSelector( cur, selectors ) ) ) { - - matched.push( cur ); - break; - } - } - } - } - - return this.pushStack( matched.length > 1 ? jQuery.uniqueSort( matched ) : matched ); - }, - - // Determine the position of an element within the set - index: function( elem ) { - - // No argument, return index in parent - if ( !elem ) { - return ( this[ 0 ] && this[ 0 ].parentNode ) ? 
this.first().prevAll().length : -1; - } - - // Index in selector - if ( typeof elem === "string" ) { - return indexOf.call( jQuery( elem ), this[ 0 ] ); - } - - // Locate the position of the desired element - return indexOf.call( this, - - // If it receives a jQuery object, the first element is used - elem.jquery ? elem[ 0 ] : elem - ); - }, - - add: function( selector, context ) { - return this.pushStack( - jQuery.uniqueSort( - jQuery.merge( this.get(), jQuery( selector, context ) ) - ) - ); - }, - - addBack: function( selector ) { - return this.add( selector == null ? - this.prevObject : this.prevObject.filter( selector ) - ); - } -} ); - -function sibling( cur, dir ) { - while ( ( cur = cur[ dir ] ) && cur.nodeType !== 1 ) {} - return cur; -} - -jQuery.each( { - parent: function( elem ) { - var parent = elem.parentNode; - return parent && parent.nodeType !== 11 ? parent : null; - }, - parents: function( elem ) { - return dir( elem, "parentNode" ); - }, - parentsUntil: function( elem, _i, until ) { - return dir( elem, "parentNode", until ); - }, - next: function( elem ) { - return sibling( elem, "nextSibling" ); - }, - prev: function( elem ) { - return sibling( elem, "previousSibling" ); - }, - nextAll: function( elem ) { - return dir( elem, "nextSibling" ); - }, - prevAll: function( elem ) { - return dir( elem, "previousSibling" ); - }, - nextUntil: function( elem, _i, until ) { - return dir( elem, "nextSibling", until ); - }, - prevUntil: function( elem, _i, until ) { - return dir( elem, "previousSibling", until ); - }, - siblings: function( elem ) { - return siblings( ( elem.parentNode || {} ).firstChild, elem ); - }, - children: function( elem ) { - return siblings( elem.firstChild ); - }, - contents: function( elem ) { - if ( elem.contentDocument != null && - - // Support: IE 11+ - // elements with no `data` attribute has an object - // `contentDocument` with a `null` prototype. 
- getProto( elem.contentDocument ) ) { - - return elem.contentDocument; - } - - // Support: IE 9 - 11 only, iOS 7 only, Android Browser <=4.3 only - // Treat the template element as a regular one in browsers that - // don't support it. - if ( nodeName( elem, "template" ) ) { - elem = elem.content || elem; - } - - return jQuery.merge( [], elem.childNodes ); - } -}, function( name, fn ) { - jQuery.fn[ name ] = function( until, selector ) { - var matched = jQuery.map( this, fn, until ); - - if ( name.slice( -5 ) !== "Until" ) { - selector = until; - } - - if ( selector && typeof selector === "string" ) { - matched = jQuery.filter( selector, matched ); - } - - if ( this.length > 1 ) { - - // Remove duplicates - if ( !guaranteedUnique[ name ] ) { - jQuery.uniqueSort( matched ); - } - - // Reverse order for parents* and prev-derivatives - if ( rparentsprev.test( name ) ) { - matched.reverse(); - } - } - - return this.pushStack( matched ); - }; -} ); -var rnothtmlwhite = ( /[^\x20\t\r\n\f]+/g ); - - - -// Convert String-formatted options into Object-formatted ones -function createOptions( options ) { - var object = {}; - jQuery.each( options.match( rnothtmlwhite ) || [], function( _, flag ) { - object[ flag ] = true; - } ); - return object; -} - -/* - * Create a callback list using the following parameters: - * - * options: an optional list of space-separated options that will change how - * the callback list behaves or a more traditional option object - * - * By default a callback list will act like an event callback list and can be - * "fired" multiple times. 
- * - * Possible options: - * - * once: will ensure the callback list can only be fired once (like a Deferred) - * - * memory: will keep track of previous values and will call any callback added - * after the list has been fired right away with the latest "memorized" - * values (like a Deferred) - * - * unique: will ensure a callback can only be added once (no duplicate in the list) - * - * stopOnFalse: interrupt callings when a callback returns false - * - */ -jQuery.Callbacks = function( options ) { - - // Convert options from String-formatted to Object-formatted if needed - // (we check in cache first) - options = typeof options === "string" ? - createOptions( options ) : - jQuery.extend( {}, options ); - - var // Flag to know if list is currently firing - firing, - - // Last fire value for non-forgettable lists - memory, - - // Flag to know if list was already fired - fired, - - // Flag to prevent firing - locked, - - // Actual callback list - list = [], - - // Queue of execution data for repeatable lists - queue = [], - - // Index of currently firing callback (modified by add/remove as needed) - firingIndex = -1, - - // Fire callbacks - fire = function() { - - // Enforce single-firing - locked = locked || options.once; - - // Execute callbacks for all pending executions, - // respecting firingIndex overrides and runtime changes - fired = firing = true; - for ( ; queue.length; firingIndex = -1 ) { - memory = queue.shift(); - while ( ++firingIndex < list.length ) { - - // Run callback and check for early termination - if ( list[ firingIndex ].apply( memory[ 0 ], memory[ 1 ] ) === false && - options.stopOnFalse ) { - - // Jump to end and forget the data so .add doesn't re-fire - firingIndex = list.length; - memory = false; - } - } - } - - // Forget the data if we're done with it - if ( !options.memory ) { - memory = false; - } - - firing = false; - - // Clean up if we're done firing for good - if ( locked ) { - - // Keep an empty list if we have data for future 
add calls - if ( memory ) { - list = []; - - // Otherwise, this object is spent - } else { - list = ""; - } - } - }, - - // Actual Callbacks object - self = { - - // Add a callback or a collection of callbacks to the list - add: function() { - if ( list ) { - - // If we have memory from a past run, we should fire after adding - if ( memory && !firing ) { - firingIndex = list.length - 1; - queue.push( memory ); - } - - ( function add( args ) { - jQuery.each( args, function( _, arg ) { - if ( isFunction( arg ) ) { - if ( !options.unique || !self.has( arg ) ) { - list.push( arg ); - } - } else if ( arg && arg.length && toType( arg ) !== "string" ) { - - // Inspect recursively - add( arg ); - } - } ); - } )( arguments ); - - if ( memory && !firing ) { - fire(); - } - } - return this; - }, - - // Remove a callback from the list - remove: function() { - jQuery.each( arguments, function( _, arg ) { - var index; - while ( ( index = jQuery.inArray( arg, list, index ) ) > -1 ) { - list.splice( index, 1 ); - - // Handle firing indexes - if ( index <= firingIndex ) { - firingIndex--; - } - } - } ); - return this; - }, - - // Check if a given callback is in the list. - // If no argument is given, return whether or not list has callbacks attached. - has: function( fn ) { - return fn ? 
- jQuery.inArray( fn, list ) > -1 : - list.length > 0; - }, - - // Remove all callbacks from the list - empty: function() { - if ( list ) { - list = []; - } - return this; - }, - - // Disable .fire and .add - // Abort any current/pending executions - // Clear all callbacks and values - disable: function() { - locked = queue = []; - list = memory = ""; - return this; - }, - disabled: function() { - return !list; - }, - - // Disable .fire - // Also disable .add unless we have memory (since it would have no effect) - // Abort any pending executions - lock: function() { - locked = queue = []; - if ( !memory && !firing ) { - list = memory = ""; - } - return this; - }, - locked: function() { - return !!locked; - }, - - // Call all callbacks with the given context and arguments - fireWith: function( context, args ) { - if ( !locked ) { - args = args || []; - args = [ context, args.slice ? args.slice() : args ]; - queue.push( args ); - if ( !firing ) { - fire(); - } - } - return this; - }, - - // Call all the callbacks with the given arguments - fire: function() { - self.fireWith( this, arguments ); - return this; - }, - - // To know if the callbacks have already been called at least once - fired: function() { - return !!fired; - } - }; - - return self; -}; - - -function Identity( v ) { - return v; -} -function Thrower( ex ) { - throw ex; -} - -function adoptValue( value, resolve, reject, noValue ) { - var method; - - try { - - // Check for promise aspect first to privilege synchronous behavior - if ( value && isFunction( ( method = value.promise ) ) ) { - method.call( value ).done( resolve ).fail( reject ); - - // Other thenables - } else if ( value && isFunction( ( method = value.then ) ) ) { - method.call( value, resolve, reject ); - - // Other non-thenables - } else { - - // Control `resolve` arguments by letting Array#slice cast boolean `noValue` to integer: - // * false: [ value ].slice( 0 ) => resolve( value ) - // * true: [ value ].slice( 1 ) => resolve() - 
resolve.apply( undefined, [ value ].slice( noValue ) ); - } - - // For Promises/A+, convert exceptions into rejections - // Since jQuery.when doesn't unwrap thenables, we can skip the extra checks appearing in - // Deferred#then to conditionally suppress rejection. - } catch ( value ) { - - // Support: Android 4.0 only - // Strict mode functions invoked without .call/.apply get global-object context - reject.apply( undefined, [ value ] ); - } -} - -jQuery.extend( { - - Deferred: function( func ) { - var tuples = [ - - // action, add listener, callbacks, - // ... .then handlers, argument index, [final state] - [ "notify", "progress", jQuery.Callbacks( "memory" ), - jQuery.Callbacks( "memory" ), 2 ], - [ "resolve", "done", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 0, "resolved" ], - [ "reject", "fail", jQuery.Callbacks( "once memory" ), - jQuery.Callbacks( "once memory" ), 1, "rejected" ] - ], - state = "pending", - promise = { - state: function() { - return state; - }, - always: function() { - deferred.done( arguments ).fail( arguments ); - return this; - }, - "catch": function( fn ) { - return promise.then( null, fn ); - }, - - // Keep pipe for back-compat - pipe: function( /* fnDone, fnFail, fnProgress */ ) { - var fns = arguments; - - return jQuery.Deferred( function( newDefer ) { - jQuery.each( tuples, function( _i, tuple ) { - - // Map tuples (progress, done, fail) to arguments (done, fail, progress) - var fn = isFunction( fns[ tuple[ 4 ] ] ) && fns[ tuple[ 4 ] ]; - - // deferred.progress(function() { bind to newDefer or newDefer.notify }) - // deferred.done(function() { bind to newDefer or newDefer.resolve }) - // deferred.fail(function() { bind to newDefer or newDefer.reject }) - deferred[ tuple[ 1 ] ]( function() { - var returned = fn && fn.apply( this, arguments ); - if ( returned && isFunction( returned.promise ) ) { - returned.promise() - .progress( newDefer.notify ) - .done( newDefer.resolve ) - .fail( newDefer.reject ); - } 
else { - newDefer[ tuple[ 0 ] + "With" ]( - this, - fn ? [ returned ] : arguments - ); - } - } ); - } ); - fns = null; - } ).promise(); - }, - then: function( onFulfilled, onRejected, onProgress ) { - var maxDepth = 0; - function resolve( depth, deferred, handler, special ) { - return function() { - var that = this, - args = arguments, - mightThrow = function() { - var returned, then; - - // Support: Promises/A+ section 2.3.3.3.3 - // https://promisesaplus.com/#point-59 - // Ignore double-resolution attempts - if ( depth < maxDepth ) { - return; - } - - returned = handler.apply( that, args ); - - // Support: Promises/A+ section 2.3.1 - // https://promisesaplus.com/#point-48 - if ( returned === deferred.promise() ) { - throw new TypeError( "Thenable self-resolution" ); - } - - // Support: Promises/A+ sections 2.3.3.1, 3.5 - // https://promisesaplus.com/#point-54 - // https://promisesaplus.com/#point-75 - // Retrieve `then` only once - then = returned && - - // Support: Promises/A+ section 2.3.4 - // https://promisesaplus.com/#point-64 - // Only check objects and functions for thenability - ( typeof returned === "object" || - typeof returned === "function" ) && - returned.then; - - // Handle a returned thenable - if ( isFunction( then ) ) { - - // Special processors (notify) just wait for resolution - if ( special ) { - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ) - ); - - // Normal processors (resolve) also hook into progress - } else { - - // ...and disregard older resolution values - maxDepth++; - - then.call( - returned, - resolve( maxDepth, deferred, Identity, special ), - resolve( maxDepth, deferred, Thrower, special ), - resolve( maxDepth, deferred, Identity, - deferred.notifyWith ) - ); - } - - // Handle all other returned values - } else { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Identity ) { - that = 
undefined; - args = [ returned ]; - } - - // Process the value(s) - // Default process is resolve - ( special || deferred.resolveWith )( that, args ); - } - }, - - // Only normal processors (resolve) catch and reject exceptions - process = special ? - mightThrow : - function() { - try { - mightThrow(); - } catch ( e ) { - - if ( jQuery.Deferred.exceptionHook ) { - jQuery.Deferred.exceptionHook( e, - process.stackTrace ); - } - - // Support: Promises/A+ section 2.3.3.3.4.1 - // https://promisesaplus.com/#point-61 - // Ignore post-resolution exceptions - if ( depth + 1 >= maxDepth ) { - - // Only substitute handlers pass on context - // and multiple values (non-spec behavior) - if ( handler !== Thrower ) { - that = undefined; - args = [ e ]; - } - - deferred.rejectWith( that, args ); - } - } - }; - - // Support: Promises/A+ section 2.3.3.3.1 - // https://promisesaplus.com/#point-57 - // Re-resolve promises immediately to dodge false rejection from - // subsequent errors - if ( depth ) { - process(); - } else { - - // Call an optional hook to record the stack, in case of exception - // since it's otherwise lost when execution goes async - if ( jQuery.Deferred.getStackHook ) { - process.stackTrace = jQuery.Deferred.getStackHook(); - } - window.setTimeout( process ); - } - }; - } - - return jQuery.Deferred( function( newDefer ) { - - // progress_handlers.add( ... ) - tuples[ 0 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onProgress ) ? - onProgress : - Identity, - newDefer.notifyWith - ) - ); - - // fulfilled_handlers.add( ... ) - tuples[ 1 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onFulfilled ) ? - onFulfilled : - Identity - ) - ); - - // rejected_handlers.add( ... ) - tuples[ 2 ][ 3 ].add( - resolve( - 0, - newDefer, - isFunction( onRejected ) ? 
- onRejected : - Thrower - ) - ); - } ).promise(); - }, - - // Get a promise for this deferred - // If obj is provided, the promise aspect is added to the object - promise: function( obj ) { - return obj != null ? jQuery.extend( obj, promise ) : promise; - } - }, - deferred = {}; - - // Add list-specific methods - jQuery.each( tuples, function( i, tuple ) { - var list = tuple[ 2 ], - stateString = tuple[ 5 ]; - - // promise.progress = list.add - // promise.done = list.add - // promise.fail = list.add - promise[ tuple[ 1 ] ] = list.add; - - // Handle state - if ( stateString ) { - list.add( - function() { - - // state = "resolved" (i.e., fulfilled) - // state = "rejected" - state = stateString; - }, - - // rejected_callbacks.disable - // fulfilled_callbacks.disable - tuples[ 3 - i ][ 2 ].disable, - - // rejected_handlers.disable - // fulfilled_handlers.disable - tuples[ 3 - i ][ 3 ].disable, - - // progress_callbacks.lock - tuples[ 0 ][ 2 ].lock, - - // progress_handlers.lock - tuples[ 0 ][ 3 ].lock - ); - } - - // progress_handlers.fire - // fulfilled_handlers.fire - // rejected_handlers.fire - list.add( tuple[ 3 ].fire ); - - // deferred.notify = function() { deferred.notifyWith(...) } - // deferred.resolve = function() { deferred.resolveWith(...) } - // deferred.reject = function() { deferred.rejectWith(...) } - deferred[ tuple[ 0 ] ] = function() { - deferred[ tuple[ 0 ] + "With" ]( this === deferred ? undefined : this, arguments ); - return this; - }; - - // deferred.notifyWith = list.fireWith - // deferred.resolveWith = list.fireWith - // deferred.rejectWith = list.fireWith - deferred[ tuple[ 0 ] + "With" ] = list.fireWith; - } ); - - // Make the deferred a promise - promise.promise( deferred ); - - // Call given func if any - if ( func ) { - func.call( deferred, deferred ); - } - - // All done! 
- return deferred; - }, - - // Deferred helper - when: function( singleValue ) { - var - - // count of uncompleted subordinates - remaining = arguments.length, - - // count of unprocessed arguments - i = remaining, - - // subordinate fulfillment data - resolveContexts = Array( i ), - resolveValues = slice.call( arguments ), - - // the master Deferred - master = jQuery.Deferred(), - - // subordinate callback factory - updateFunc = function( i ) { - return function( value ) { - resolveContexts[ i ] = this; - resolveValues[ i ] = arguments.length > 1 ? slice.call( arguments ) : value; - if ( !( --remaining ) ) { - master.resolveWith( resolveContexts, resolveValues ); - } - }; - }; - - // Single- and empty arguments are adopted like Promise.resolve - if ( remaining <= 1 ) { - adoptValue( singleValue, master.done( updateFunc( i ) ).resolve, master.reject, - !remaining ); - - // Use .then() to unwrap secondary thenables (cf. gh-3000) - if ( master.state() === "pending" || - isFunction( resolveValues[ i ] && resolveValues[ i ].then ) ) { - - return master.then(); - } - } - - // Multiple arguments are aggregated like Promise.all array elements - while ( i-- ) { - adoptValue( resolveValues[ i ], updateFunc( i ), master.reject ); - } - - return master.promise(); - } -} ); - - -// These usually indicate a programmer mistake during development, -// warn about them ASAP rather than swallowing them by default. 
-var rerrorNames = /^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/; - -jQuery.Deferred.exceptionHook = function( error, stack ) { - - // Support: IE 8 - 9 only - // Console exists when dev tools are open, which can happen at any time - if ( window.console && window.console.warn && error && rerrorNames.test( error.name ) ) { - window.console.warn( "jQuery.Deferred exception: " + error.message, error.stack, stack ); - } -}; - - - - -jQuery.readyException = function( error ) { - window.setTimeout( function() { - throw error; - } ); -}; - - - - -// The deferred used on DOM ready -var readyList = jQuery.Deferred(); - -jQuery.fn.ready = function( fn ) { - - readyList - .then( fn ) - - // Wrap jQuery.readyException in a function so that the lookup - // happens at the time of error handling instead of callback - // registration. - .catch( function( error ) { - jQuery.readyException( error ); - } ); - - return this; -}; - -jQuery.extend( { - - // Is the DOM ready to be used? Set to true once it occurs. - isReady: false, - - // A counter to track how many items to wait for before - // the ready event fires. See #6781 - readyWait: 1, - - // Handle when the DOM is ready - ready: function( wait ) { - - // Abort if there are pending holds or we're already ready - if ( wait === true ? 
--jQuery.readyWait : jQuery.isReady ) { - return; - } - - // Remember that the DOM is ready - jQuery.isReady = true; - - // If a normal DOM Ready event fired, decrement, and wait if need be - if ( wait !== true && --jQuery.readyWait > 0 ) { - return; - } - - // If there are functions bound, to execute - readyList.resolveWith( document, [ jQuery ] ); - } -} ); - -jQuery.ready.then = readyList.then; - -// The ready event handler and self cleanup method -function completed() { - document.removeEventListener( "DOMContentLoaded", completed ); - window.removeEventListener( "load", completed ); - jQuery.ready(); -} - -// Catch cases where $(document).ready() is called -// after the browser event has already occurred. -// Support: IE <=9 - 10 only -// Older IE sometimes signals "interactive" too soon -if ( document.readyState === "complete" || - ( document.readyState !== "loading" && !document.documentElement.doScroll ) ) { - - // Handle it asynchronously to allow scripts the opportunity to delay ready - window.setTimeout( jQuery.ready ); - -} else { - - // Use the handy event callback - document.addEventListener( "DOMContentLoaded", completed ); - - // A fallback to window.onload, that will always work - window.addEventListener( "load", completed ); -} - - - - -// Multifunctional method to get and set values of a collection -// The value/s can optionally be executed if it's a function -var access = function( elems, fn, key, value, chainable, emptyGet, raw ) { - var i = 0, - len = elems.length, - bulk = key == null; - - // Sets many values - if ( toType( key ) === "object" ) { - chainable = true; - for ( i in key ) { - access( elems, fn, i, key[ i ], true, emptyGet, raw ); - } - - // Sets one value - } else if ( value !== undefined ) { - chainable = true; - - if ( !isFunction( value ) ) { - raw = true; - } - - if ( bulk ) { - - // Bulk operations run against the entire set - if ( raw ) { - fn.call( elems, value ); - fn = null; - - // ...except when executing function 
values - } else { - bulk = fn; - fn = function( elem, _key, value ) { - return bulk.call( jQuery( elem ), value ); - }; - } - } - - if ( fn ) { - for ( ; i < len; i++ ) { - fn( - elems[ i ], key, raw ? - value : - value.call( elems[ i ], i, fn( elems[ i ], key ) ) - ); - } - } - } - - if ( chainable ) { - return elems; - } - - // Gets - if ( bulk ) { - return fn.call( elems ); - } - - return len ? fn( elems[ 0 ], key ) : emptyGet; -}; - - -// Matches dashed string for camelizing -var rmsPrefix = /^-ms-/, - rdashAlpha = /-([a-z])/g; - -// Used by camelCase as callback to replace() -function fcamelCase( _all, letter ) { - return letter.toUpperCase(); -} - -// Convert dashed to camelCase; used by the css and data modules -// Support: IE <=9 - 11, Edge 12 - 15 -// Microsoft forgot to hump their vendor prefix (#9572) -function camelCase( string ) { - return string.replace( rmsPrefix, "ms-" ).replace( rdashAlpha, fcamelCase ); -} -var acceptData = function( owner ) { - - // Accepts only: - // - Node - // - Node.ELEMENT_NODE - // - Node.DOCUMENT_NODE - // - Object - // - Any - return owner.nodeType === 1 || owner.nodeType === 9 || !( +owner.nodeType ); -}; - - - - -function Data() { - this.expando = jQuery.expando + Data.uid++; -} - -Data.uid = 1; - -Data.prototype = { - - cache: function( owner ) { - - // Check if the owner object already has a cache - var value = owner[ this.expando ]; - - // If not, create one - if ( !value ) { - value = {}; - - // We can accept data for non-element nodes in modern browsers, - // but we should not, see #8335. - // Always return an empty object. 
- if ( acceptData( owner ) ) { - - // If it is a node unlikely to be stringify-ed or looped over - // use plain assignment - if ( owner.nodeType ) { - owner[ this.expando ] = value; - - // Otherwise secure it in a non-enumerable property - // configurable must be true to allow the property to be - // deleted when data is removed - } else { - Object.defineProperty( owner, this.expando, { - value: value, - configurable: true - } ); - } - } - } - - return value; - }, - set: function( owner, data, value ) { - var prop, - cache = this.cache( owner ); - - // Handle: [ owner, key, value ] args - // Always use camelCase key (gh-2257) - if ( typeof data === "string" ) { - cache[ camelCase( data ) ] = value; - - // Handle: [ owner, { properties } ] args - } else { - - // Copy the properties one-by-one to the cache object - for ( prop in data ) { - cache[ camelCase( prop ) ] = data[ prop ]; - } - } - return cache; - }, - get: function( owner, key ) { - return key === undefined ? - this.cache( owner ) : - - // Always use camelCase key (gh-2257) - owner[ this.expando ] && owner[ this.expando ][ camelCase( key ) ]; - }, - access: function( owner, key, value ) { - - // In cases where either: - // - // 1. No key was specified - // 2. A string key was specified, but no value provided - // - // Take the "read" path and allow the get method to determine - // which value to return, respectively either: - // - // 1. The entire cache object - // 2. The data stored at the key - // - if ( key === undefined || - ( ( key && typeof key === "string" ) && value === undefined ) ) { - - return this.get( owner, key ); - } - - // When the key is not a string, or both a key and value - // are specified, set or extend (existing objects) with either: - // - // 1. An object of properties - // 2. 
A key and value - // - this.set( owner, key, value ); - - // Since the "set" path can have two possible entry points - // return the expected data based on which path was taken[*] - return value !== undefined ? value : key; - }, - remove: function( owner, key ) { - var i, - cache = owner[ this.expando ]; - - if ( cache === undefined ) { - return; - } - - if ( key !== undefined ) { - - // Support array or space separated string of keys - if ( Array.isArray( key ) ) { - - // If key is an array of keys... - // We always set camelCase keys, so remove that. - key = key.map( camelCase ); - } else { - key = camelCase( key ); - - // If a key with the spaces exists, use it. - // Otherwise, create an array by matching non-whitespace - key = key in cache ? - [ key ] : - ( key.match( rnothtmlwhite ) || [] ); - } - - i = key.length; - - while ( i-- ) { - delete cache[ key[ i ] ]; - } - } - - // Remove the expando if there's no more data - if ( key === undefined || jQuery.isEmptyObject( cache ) ) { - - // Support: Chrome <=35 - 45 - // Webkit & Blink performance suffers when deleting properties - // from DOM nodes, so set to undefined instead - // https://bugs.chromium.org/p/chromium/issues/detail?id=378607 (bug restricted) - if ( owner.nodeType ) { - owner[ this.expando ] = undefined; - } else { - delete owner[ this.expando ]; - } - } - }, - hasData: function( owner ) { - var cache = owner[ this.expando ]; - return cache !== undefined && !jQuery.isEmptyObject( cache ); - } -}; -var dataPriv = new Data(); - -var dataUser = new Data(); - - - -// Implementation Summary -// -// 1. Enforce API surface and semantic compatibility with 1.9.x branch -// 2. Improve the module's maintainability by reducing the storage -// paths to a single mechanism. -// 3. Use the same single mechanism to support "private" and "user" data. -// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) -// 5. Avoid exposing implementation details on user objects (eg. 
expando properties) -// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 - -var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, - rmultiDash = /[A-Z]/g; - -function getData( data ) { - if ( data === "true" ) { - return true; - } - - if ( data === "false" ) { - return false; - } - - if ( data === "null" ) { - return null; - } - - // Only convert to a number if it doesn't change the string - if ( data === +data + "" ) { - return +data; - } - - if ( rbrace.test( data ) ) { - return JSON.parse( data ); - } - - return data; -} - -function dataAttr( elem, key, data ) { - var name; - - // If nothing was found internally, try to fetch any - // data from the HTML5 data-* attribute - if ( data === undefined && elem.nodeType === 1 ) { - name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); - data = elem.getAttribute( name ); - - if ( typeof data === "string" ) { - try { - data = getData( data ); - } catch ( e ) {} - - // Make sure we set the data so it isn't changed later - dataUser.set( elem, key, data ); - } else { - data = undefined; - } - } - return data; -} - -jQuery.extend( { - hasData: function( elem ) { - return dataUser.hasData( elem ) || dataPriv.hasData( elem ); - }, - - data: function( elem, name, data ) { - return dataUser.access( elem, name, data ); - }, - - removeData: function( elem, name ) { - dataUser.remove( elem, name ); - }, - - // TODO: Now that all calls to _data and _removeData have been replaced - // with direct calls to dataPriv methods, these can be deprecated. 
- _data: function( elem, name, data ) { - return dataPriv.access( elem, name, data ); - }, - - _removeData: function( elem, name ) { - dataPriv.remove( elem, name ); - } -} ); - -jQuery.fn.extend( { - data: function( key, value ) { - var i, name, data, - elem = this[ 0 ], - attrs = elem && elem.attributes; - - // Gets all values - if ( key === undefined ) { - if ( this.length ) { - data = dataUser.get( elem ); - - if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { - i = attrs.length; - while ( i-- ) { - - // Support: IE 11 only - // The attrs elements can be null (#14894) - if ( attrs[ i ] ) { - name = attrs[ i ].name; - if ( name.indexOf( "data-" ) === 0 ) { - name = camelCase( name.slice( 5 ) ); - dataAttr( elem, name, data[ name ] ); - } - } - } - dataPriv.set( elem, "hasDataAttrs", true ); - } - } - - return data; - } - - // Sets multiple values - if ( typeof key === "object" ) { - return this.each( function() { - dataUser.set( this, key ); - } ); - } - - return access( this, function( value ) { - var data; - - // The calling jQuery object (element matches) is not empty - // (and therefore has an element appears at this[ 0 ]) and the - // `value` parameter was not undefined. An empty jQuery object - // will result in `undefined` for elem = this[ 0 ] which will - // throw an exception if an attempt to read a data cache is made. - if ( elem && value === undefined ) { - - // Attempt to get data from the cache - // The key will always be camelCased in Data - data = dataUser.get( elem, key ); - if ( data !== undefined ) { - return data; - } - - // Attempt to "discover" the data in - // HTML5 custom data-* attrs - data = dataAttr( elem, key ); - if ( data !== undefined ) { - return data; - } - - // We tried really hard, but the data doesn't exist. - return; - } - - // Set the data... 
- this.each( function() { - - // We always store the camelCased key - dataUser.set( this, key, value ); - } ); - }, null, value, arguments.length > 1, null, true ); - }, - - removeData: function( key ) { - return this.each( function() { - dataUser.remove( this, key ); - } ); - } -} ); - - -jQuery.extend( { - queue: function( elem, type, data ) { - var queue; - - if ( elem ) { - type = ( type || "fx" ) + "queue"; - queue = dataPriv.get( elem, type ); - - // Speed up dequeue by getting out quickly if this is just a lookup - if ( data ) { - if ( !queue || Array.isArray( data ) ) { - queue = dataPriv.access( elem, type, jQuery.makeArray( data ) ); - } else { - queue.push( data ); - } - } - return queue || []; - } - }, - - dequeue: function( elem, type ) { - type = type || "fx"; - - var queue = jQuery.queue( elem, type ), - startLength = queue.length, - fn = queue.shift(), - hooks = jQuery._queueHooks( elem, type ), - next = function() { - jQuery.dequeue( elem, type ); - }; - - // If the fx queue is dequeued, always remove the progress sentinel - if ( fn === "inprogress" ) { - fn = queue.shift(); - startLength--; - } - - if ( fn ) { - - // Add a progress sentinel to prevent the fx queue from being - // automatically dequeued - if ( type === "fx" ) { - queue.unshift( "inprogress" ); - } - - // Clear up the last queue stop function - delete hooks.stop; - fn.call( elem, next, hooks ); - } - - if ( !startLength && hooks ) { - hooks.empty.fire(); - } - }, - - // Not public - generate a queueHooks object, or return the current one - _queueHooks: function( elem, type ) { - var key = type + "queueHooks"; - return dataPriv.get( elem, key ) || dataPriv.access( elem, key, { - empty: jQuery.Callbacks( "once memory" ).add( function() { - dataPriv.remove( elem, [ type + "queue", key ] ); - } ) - } ); - } -} ); - -jQuery.fn.extend( { - queue: function( type, data ) { - var setter = 2; - - if ( typeof type !== "string" ) { - data = type; - type = "fx"; - setter--; - } - - if ( 
arguments.length < setter ) { - return jQuery.queue( this[ 0 ], type ); - } - - return data === undefined ? - this : - this.each( function() { - var queue = jQuery.queue( this, type, data ); - - // Ensure a hooks for this queue - jQuery._queueHooks( this, type ); - - if ( type === "fx" && queue[ 0 ] !== "inprogress" ) { - jQuery.dequeue( this, type ); - } - } ); - }, - dequeue: function( type ) { - return this.each( function() { - jQuery.dequeue( this, type ); - } ); - }, - clearQueue: function( type ) { - return this.queue( type || "fx", [] ); - }, - - // Get a promise resolved when queues of a certain type - // are emptied (fx is the type by default) - promise: function( type, obj ) { - var tmp, - count = 1, - defer = jQuery.Deferred(), - elements = this, - i = this.length, - resolve = function() { - if ( !( --count ) ) { - defer.resolveWith( elements, [ elements ] ); - } - }; - - if ( typeof type !== "string" ) { - obj = type; - type = undefined; - } - type = type || "fx"; - - while ( i-- ) { - tmp = dataPriv.get( elements[ i ], type + "queueHooks" ); - if ( tmp && tmp.empty ) { - count++; - tmp.empty.add( resolve ); - } - } - resolve(); - return defer.promise( obj ); - } -} ); -var pnum = ( /[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/ ).source; - -var rcssNum = new RegExp( "^(?:([+-])=|)(" + pnum + ")([a-z%]*)$", "i" ); - - -var cssExpand = [ "Top", "Right", "Bottom", "Left" ]; - -var documentElement = document.documentElement; - - - - var isAttached = function( elem ) { - return jQuery.contains( elem.ownerDocument, elem ); - }, - composed = { composed: true }; - - // Support: IE 9 - 11+, Edge 12 - 18+, iOS 10.0 - 10.2 only - // Check attachment across shadow DOM boundaries when possible (gh-3504) - // Support: iOS 10.0-10.2 only - // Early iOS 10 versions support `attachShadow` but not `getRootNode`, - // leading to errors. We need to check for `getRootNode`. 
- if ( documentElement.getRootNode ) { - isAttached = function( elem ) { - return jQuery.contains( elem.ownerDocument, elem ) || - elem.getRootNode( composed ) === elem.ownerDocument; - }; - } -var isHiddenWithinTree = function( elem, el ) { - - // isHiddenWithinTree might be called from jQuery#filter function; - // in that case, element will be second argument - elem = el || elem; - - // Inline style trumps all - return elem.style.display === "none" || - elem.style.display === "" && - - // Otherwise, check computed style - // Support: Firefox <=43 - 45 - // Disconnected elements can have computed display: none, so first confirm that elem is - // in the document. - isAttached( elem ) && - - jQuery.css( elem, "display" ) === "none"; - }; - - - -function adjustCSS( elem, prop, valueParts, tween ) { - var adjusted, scale, - maxIterations = 20, - currentValue = tween ? - function() { - return tween.cur(); - } : - function() { - return jQuery.css( elem, prop, "" ); - }, - initial = currentValue(), - unit = valueParts && valueParts[ 3 ] || ( jQuery.cssNumber[ prop ] ? "" : "px" ), - - // Starting value computation is required for potential unit mismatches - initialInUnit = elem.nodeType && - ( jQuery.cssNumber[ prop ] || unit !== "px" && +initial ) && - rcssNum.exec( jQuery.css( elem, prop ) ); - - if ( initialInUnit && initialInUnit[ 3 ] !== unit ) { - - // Support: Firefox <=54 - // Halve the iteration target value to prevent interference from CSS upper bounds (gh-2144) - initial = initial / 2; - - // Trust units reported by jQuery.css - unit = unit || initialInUnit[ 3 ]; - - // Iteratively approximate from a nonzero starting point - initialInUnit = +initial || 1; - - while ( maxIterations-- ) { - - // Evaluate and update our best guess (doubling guesses that zero out). - // Finish if the scale equals or crosses 1 (making the old*new product non-positive). 
- jQuery.style( elem, prop, initialInUnit + unit ); - if ( ( 1 - scale ) * ( 1 - ( scale = currentValue() / initial || 0.5 ) ) <= 0 ) { - maxIterations = 0; - } - initialInUnit = initialInUnit / scale; - - } - - initialInUnit = initialInUnit * 2; - jQuery.style( elem, prop, initialInUnit + unit ); - - // Make sure we update the tween properties later on - valueParts = valueParts || []; - } - - if ( valueParts ) { - initialInUnit = +initialInUnit || +initial || 0; - - // Apply relative offset (+=/-=) if specified - adjusted = valueParts[ 1 ] ? - initialInUnit + ( valueParts[ 1 ] + 1 ) * valueParts[ 2 ] : - +valueParts[ 2 ]; - if ( tween ) { - tween.unit = unit; - tween.start = initialInUnit; - tween.end = adjusted; - } - } - return adjusted; -} - - -var defaultDisplayMap = {}; - -function getDefaultDisplay( elem ) { - var temp, - doc = elem.ownerDocument, - nodeName = elem.nodeName, - display = defaultDisplayMap[ nodeName ]; - - if ( display ) { - return display; - } - - temp = doc.body.appendChild( doc.createElement( nodeName ) ); - display = jQuery.css( temp, "display" ); - - temp.parentNode.removeChild( temp ); - - if ( display === "none" ) { - display = "block"; - } - defaultDisplayMap[ nodeName ] = display; - - return display; -} - -function showHide( elements, show ) { - var display, elem, - values = [], - index = 0, - length = elements.length; - - // Determine new display value for elements that need to change - for ( ; index < length; index++ ) { - elem = elements[ index ]; - if ( !elem.style ) { - continue; - } - - display = elem.style.display; - if ( show ) { - - // Since we force visibility upon cascade-hidden elements, an immediate (and slow) - // check is required in this first loop unless we have a nonempty display value (either - // inline or about-to-be-restored) - if ( display === "none" ) { - values[ index ] = dataPriv.get( elem, "display" ) || null; - if ( !values[ index ] ) { - elem.style.display = ""; - } - } - if ( elem.style.display === "" && 
isHiddenWithinTree( elem ) ) { - values[ index ] = getDefaultDisplay( elem ); - } - } else { - if ( display !== "none" ) { - values[ index ] = "none"; - - // Remember what we're overwriting - dataPriv.set( elem, "display", display ); - } - } - } - - // Set the display of the elements in a second loop to avoid constant reflow - for ( index = 0; index < length; index++ ) { - if ( values[ index ] != null ) { - elements[ index ].style.display = values[ index ]; - } - } - - return elements; -} - -jQuery.fn.extend( { - show: function() { - return showHide( this, true ); - }, - hide: function() { - return showHide( this ); - }, - toggle: function( state ) { - if ( typeof state === "boolean" ) { - return state ? this.show() : this.hide(); - } - - return this.each( function() { - if ( isHiddenWithinTree( this ) ) { - jQuery( this ).show(); - } else { - jQuery( this ).hide(); - } - } ); - } -} ); -var rcheckableType = ( /^(?:checkbox|radio)$/i ); - -var rtagName = ( /<([a-z][^\/\0>\x20\t\r\n\f]*)/i ); - -var rscriptType = ( /^$|^module$|\/(?:java|ecma)script/i ); - - - -( function() { - var fragment = document.createDocumentFragment(), - div = fragment.appendChild( document.createElement( "div" ) ), - input = document.createElement( "input" ); - - // Support: Android 4.0 - 4.3 only - // Check state lost if the name is set (#11217) - // Support: Windows Web Apps (WWA) - // `name` and `type` must use .setAttribute for WWA (#14901) - input.setAttribute( "type", "radio" ); - input.setAttribute( "checked", "checked" ); - input.setAttribute( "name", "t" ); - - div.appendChild( input ); - - // Support: Android <=4.1 only - // Older WebKit doesn't clone checked state correctly in fragments - support.checkClone = div.cloneNode( true ).cloneNode( true ).lastChild.checked; - - // Support: IE <=11 only - // Make sure textarea (and checkbox) defaultValue is properly cloned - div.innerHTML = ""; - support.noCloneChecked = !!div.cloneNode( true ).lastChild.defaultValue; - - // Support: IE 
<=9 only - // IE <=9 replaces "; - support.option = !!div.lastChild; -} )(); - - -// We have to close these tags to support XHTML (#13200) -var wrapMap = { - - // XHTML parsers do not magically insert elements in the - // same way that tag soup parsers do. So we cannot shorten - // this by omitting or other required elements. - thead: [ 1, "", "
" ], - col: [ 2, "", "
" ], - tr: [ 2, "", "
" ], - td: [ 3, "", "
" ], - - _default: [ 0, "", "" ] -}; - -wrapMap.tbody = wrapMap.tfoot = wrapMap.colgroup = wrapMap.caption = wrapMap.thead; -wrapMap.th = wrapMap.td; - -// Support: IE <=9 only -if ( !support.option ) { - wrapMap.optgroup = wrapMap.option = [ 1, "" ]; -} - - -function getAll( context, tag ) { - - // Support: IE <=9 - 11 only - // Use typeof to avoid zero-argument method invocation on host objects (#15151) - var ret; - - if ( typeof context.getElementsByTagName !== "undefined" ) { - ret = context.getElementsByTagName( tag || "*" ); - - } else if ( typeof context.querySelectorAll !== "undefined" ) { - ret = context.querySelectorAll( tag || "*" ); - - } else { - ret = []; - } - - if ( tag === undefined || tag && nodeName( context, tag ) ) { - return jQuery.merge( [ context ], ret ); - } - - return ret; -} - - -// Mark scripts as having already been evaluated -function setGlobalEval( elems, refElements ) { - var i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - dataPriv.set( - elems[ i ], - "globalEval", - !refElements || dataPriv.get( refElements[ i ], "globalEval" ) - ); - } -} - - -var rhtml = /<|&#?\w+;/; - -function buildFragment( elems, context, scripts, selection, ignored ) { - var elem, tmp, tag, wrap, attached, j, - fragment = context.createDocumentFragment(), - nodes = [], - i = 0, - l = elems.length; - - for ( ; i < l; i++ ) { - elem = elems[ i ]; - - if ( elem || elem === 0 ) { - - // Add nodes directly - if ( toType( elem ) === "object" ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, elem.nodeType ? 
[ elem ] : elem ); - - // Convert non-html into a text node - } else if ( !rhtml.test( elem ) ) { - nodes.push( context.createTextNode( elem ) ); - - // Convert html into DOM nodes - } else { - tmp = tmp || fragment.appendChild( context.createElement( "div" ) ); - - // Deserialize a standard representation - tag = ( rtagName.exec( elem ) || [ "", "" ] )[ 1 ].toLowerCase(); - wrap = wrapMap[ tag ] || wrapMap._default; - tmp.innerHTML = wrap[ 1 ] + jQuery.htmlPrefilter( elem ) + wrap[ 2 ]; - - // Descend through wrappers to the right content - j = wrap[ 0 ]; - while ( j-- ) { - tmp = tmp.lastChild; - } - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( nodes, tmp.childNodes ); - - // Remember the top-level container - tmp = fragment.firstChild; - - // Ensure the created nodes are orphaned (#12392) - tmp.textContent = ""; - } - } - } - - // Remove wrapper from fragment - fragment.textContent = ""; - - i = 0; - while ( ( elem = nodes[ i++ ] ) ) { - - // Skip elements already in the context collection (trac-4087) - if ( selection && jQuery.inArray( elem, selection ) > -1 ) { - if ( ignored ) { - ignored.push( elem ); - } - continue; - } - - attached = isAttached( elem ); - - // Append to fragment - tmp = getAll( fragment.appendChild( elem ), "script" ); - - // Preserve script evaluation history - if ( attached ) { - setGlobalEval( tmp ); - } - - // Capture executables - if ( scripts ) { - j = 0; - while ( ( elem = tmp[ j++ ] ) ) { - if ( rscriptType.test( elem.type || "" ) ) { - scripts.push( elem ); - } - } - } - } - - return fragment; -} - - -var - rkeyEvent = /^key/, - rmouseEvent = /^(?:mouse|pointer|contextmenu|drag|drop)|click/, - rtypenamespace = /^([^.]*)(?:\.(.+)|)/; - -function returnTrue() { - return true; -} - -function returnFalse() { - return false; -} - -// Support: IE <=9 - 11+ -// focus() and blur() are asynchronous, except when they are no-op. 
-// So expect focus to be synchronous when the element is already active, -// and blur to be synchronous when the element is not already active. -// (focus and blur are always synchronous in other supported browsers, -// this just defines when we can count on it). -function expectSync( elem, type ) { - return ( elem === safeActiveElement() ) === ( type === "focus" ); -} - -// Support: IE <=9 only -// Accessing document.activeElement can throw unexpectedly -// https://bugs.jquery.com/ticket/13393 -function safeActiveElement() { - try { - return document.activeElement; - } catch ( err ) { } -} - -function on( elem, types, selector, data, fn, one ) { - var origFn, type; - - // Types can be a map of types/handlers - if ( typeof types === "object" ) { - - // ( types-Object, selector, data ) - if ( typeof selector !== "string" ) { - - // ( types-Object, data ) - data = data || selector; - selector = undefined; - } - for ( type in types ) { - on( elem, type, selector, data, types[ type ], one ); - } - return elem; - } - - if ( data == null && fn == null ) { - - // ( types, fn ) - fn = selector; - data = selector = undefined; - } else if ( fn == null ) { - if ( typeof selector === "string" ) { - - // ( types, selector, fn ) - fn = data; - data = undefined; - } else { - - // ( types, data, fn ) - fn = data; - data = selector; - selector = undefined; - } - } - if ( fn === false ) { - fn = returnFalse; - } else if ( !fn ) { - return elem; - } - - if ( one === 1 ) { - origFn = fn; - fn = function( event ) { - - // Can use an empty set, since event contains the info - jQuery().off( event ); - return origFn.apply( this, arguments ); - }; - - // Use same guid so caller can remove using origFn - fn.guid = origFn.guid || ( origFn.guid = jQuery.guid++ ); - } - return elem.each( function() { - jQuery.event.add( this, types, fn, data, selector ); - } ); -} - -/* - * Helper functions for managing events -- not part of the public interface. 
- * Props to Dean Edwards' addEvent library for many of the ideas. - */ -jQuery.event = { - - global: {}, - - add: function( elem, types, handler, data, selector ) { - - var handleObjIn, eventHandle, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.get( elem ); - - // Only attach events to objects that accept data - if ( !acceptData( elem ) ) { - return; - } - - // Caller can pass in an object of custom data in lieu of the handler - if ( handler.handler ) { - handleObjIn = handler; - handler = handleObjIn.handler; - selector = handleObjIn.selector; - } - - // Ensure that invalid selectors throw exceptions at attach time - // Evaluate against documentElement in case elem is a non-element node (e.g., document) - if ( selector ) { - jQuery.find.matchesSelector( documentElement, selector ); - } - - // Make sure that the handler has a unique ID, used to find/remove it later - if ( !handler.guid ) { - handler.guid = jQuery.guid++; - } - - // Init the element's event structure and main handler, if this is the first - if ( !( events = elemData.events ) ) { - events = elemData.events = Object.create( null ); - } - if ( !( eventHandle = elemData.handle ) ) { - eventHandle = elemData.handle = function( e ) { - - // Discard the second event of a jQuery.event.trigger() and - // when an event is called after a page has unloaded - return typeof jQuery !== "undefined" && jQuery.event.triggered !== e.type ? - jQuery.event.dispatch.apply( elem, arguments ) : undefined; - }; - } - - // Handle multiple events separated by a space - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." 
).sort(); - - // There *must* be a type, no attaching namespace-only handlers - if ( !type ) { - continue; - } - - // If event changes its type, use the special event handlers for the changed type - special = jQuery.event.special[ type ] || {}; - - // If selector defined, determine special event api type, otherwise given type - type = ( selector ? special.delegateType : special.bindType ) || type; - - // Update special based on newly reset type - special = jQuery.event.special[ type ] || {}; - - // handleObj is passed to all event handlers - handleObj = jQuery.extend( { - type: type, - origType: origType, - data: data, - handler: handler, - guid: handler.guid, - selector: selector, - needsContext: selector && jQuery.expr.match.needsContext.test( selector ), - namespace: namespaces.join( "." ) - }, handleObjIn ); - - // Init the event handler queue if we're the first - if ( !( handlers = events[ type ] ) ) { - handlers = events[ type ] = []; - handlers.delegateCount = 0; - - // Only use addEventListener if the special events handler returns false - if ( !special.setup || - special.setup.call( elem, data, namespaces, eventHandle ) === false ) { - - if ( elem.addEventListener ) { - elem.addEventListener( type, eventHandle ); - } - } - } - - if ( special.add ) { - special.add.call( elem, handleObj ); - - if ( !handleObj.handler.guid ) { - handleObj.handler.guid = handler.guid; - } - } - - // Add to the element's handler list, delegates in front - if ( selector ) { - handlers.splice( handlers.delegateCount++, 0, handleObj ); - } else { - handlers.push( handleObj ); - } - - // Keep track of which events have ever been used, for event optimization - jQuery.event.global[ type ] = true; - } - - }, - - // Detach an event or set of events from an element - remove: function( elem, types, handler, selector, mappedTypes ) { - - var j, origCount, tmp, - events, t, handleObj, - special, handlers, type, namespaces, origType, - elemData = dataPriv.hasData( elem ) && dataPriv.get( 
elem ); - - if ( !elemData || !( events = elemData.events ) ) { - return; - } - - // Once for each type.namespace in types; type may be omitted - types = ( types || "" ).match( rnothtmlwhite ) || [ "" ]; - t = types.length; - while ( t-- ) { - tmp = rtypenamespace.exec( types[ t ] ) || []; - type = origType = tmp[ 1 ]; - namespaces = ( tmp[ 2 ] || "" ).split( "." ).sort(); - - // Unbind all events (on this namespace, if provided) for the element - if ( !type ) { - for ( type in events ) { - jQuery.event.remove( elem, type + types[ t ], handler, selector, true ); - } - continue; - } - - special = jQuery.event.special[ type ] || {}; - type = ( selector ? special.delegateType : special.bindType ) || type; - handlers = events[ type ] || []; - tmp = tmp[ 2 ] && - new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ); - - // Remove matching events - origCount = j = handlers.length; - while ( j-- ) { - handleObj = handlers[ j ]; - - if ( ( mappedTypes || origType === handleObj.origType ) && - ( !handler || handler.guid === handleObj.guid ) && - ( !tmp || tmp.test( handleObj.namespace ) ) && - ( !selector || selector === handleObj.selector || - selector === "**" && handleObj.selector ) ) { - handlers.splice( j, 1 ); - - if ( handleObj.selector ) { - handlers.delegateCount--; - } - if ( special.remove ) { - special.remove.call( elem, handleObj ); - } - } - } - - // Remove generic event handler if we removed something and no more handlers exist - // (avoids potential for endless recursion during removal of special event handlers) - if ( origCount && !handlers.length ) { - if ( !special.teardown || - special.teardown.call( elem, namespaces, elemData.handle ) === false ) { - - jQuery.removeEvent( elem, type, elemData.handle ); - } - - delete events[ type ]; - } - } - - // Remove data and the expando if it's no longer used - if ( jQuery.isEmptyObject( events ) ) { - dataPriv.remove( elem, "handle events" ); - } - }, - - dispatch: function( nativeEvent ) { - - 
var i, j, ret, matched, handleObj, handlerQueue, - args = new Array( arguments.length ), - - // Make a writable jQuery.Event from the native event object - event = jQuery.event.fix( nativeEvent ), - - handlers = ( - dataPriv.get( this, "events" ) || Object.create( null ) - )[ event.type ] || [], - special = jQuery.event.special[ event.type ] || {}; - - // Use the fix-ed jQuery.Event rather than the (read-only) native event - args[ 0 ] = event; - - for ( i = 1; i < arguments.length; i++ ) { - args[ i ] = arguments[ i ]; - } - - event.delegateTarget = this; - - // Call the preDispatch hook for the mapped type, and let it bail if desired - if ( special.preDispatch && special.preDispatch.call( this, event ) === false ) { - return; - } - - // Determine handlers - handlerQueue = jQuery.event.handlers.call( this, event, handlers ); - - // Run delegates first; they may want to stop propagation beneath us - i = 0; - while ( ( matched = handlerQueue[ i++ ] ) && !event.isPropagationStopped() ) { - event.currentTarget = matched.elem; - - j = 0; - while ( ( handleObj = matched.handlers[ j++ ] ) && - !event.isImmediatePropagationStopped() ) { - - // If the event is namespaced, then each handler is only invoked if it is - // specially universal or its namespaces are a superset of the event's. 
- if ( !event.rnamespace || handleObj.namespace === false || - event.rnamespace.test( handleObj.namespace ) ) { - - event.handleObj = handleObj; - event.data = handleObj.data; - - ret = ( ( jQuery.event.special[ handleObj.origType ] || {} ).handle || - handleObj.handler ).apply( matched.elem, args ); - - if ( ret !== undefined ) { - if ( ( event.result = ret ) === false ) { - event.preventDefault(); - event.stopPropagation(); - } - } - } - } - } - - // Call the postDispatch hook for the mapped type - if ( special.postDispatch ) { - special.postDispatch.call( this, event ); - } - - return event.result; - }, - - handlers: function( event, handlers ) { - var i, handleObj, sel, matchedHandlers, matchedSelectors, - handlerQueue = [], - delegateCount = handlers.delegateCount, - cur = event.target; - - // Find delegate handlers - if ( delegateCount && - - // Support: IE <=9 - // Black-hole SVG instance trees (trac-13180) - cur.nodeType && - - // Support: Firefox <=42 - // Suppress spec-violating clicks indicating a non-primary pointer button (trac-3861) - // https://www.w3.org/TR/DOM-Level-3-Events/#event-type-click - // Support: IE 11 only - // ...but not arrow key "clicks" of radio inputs, which can have `button` -1 (gh-2343) - !( event.type === "click" && event.button >= 1 ) ) { - - for ( ; cur !== this; cur = cur.parentNode || this ) { - - // Don't check non-elements (#13208) - // Don't process clicks on disabled elements (#6911, #8165, #11382, #11764) - if ( cur.nodeType === 1 && !( event.type === "click" && cur.disabled === true ) ) { - matchedHandlers = []; - matchedSelectors = {}; - for ( i = 0; i < delegateCount; i++ ) { - handleObj = handlers[ i ]; - - // Don't conflict with Object.prototype properties (#13203) - sel = handleObj.selector + " "; - - if ( matchedSelectors[ sel ] === undefined ) { - matchedSelectors[ sel ] = handleObj.needsContext ? 
- jQuery( sel, this ).index( cur ) > -1 : - jQuery.find( sel, this, null, [ cur ] ).length; - } - if ( matchedSelectors[ sel ] ) { - matchedHandlers.push( handleObj ); - } - } - if ( matchedHandlers.length ) { - handlerQueue.push( { elem: cur, handlers: matchedHandlers } ); - } - } - } - } - - // Add the remaining (directly-bound) handlers - cur = this; - if ( delegateCount < handlers.length ) { - handlerQueue.push( { elem: cur, handlers: handlers.slice( delegateCount ) } ); - } - - return handlerQueue; - }, - - addProp: function( name, hook ) { - Object.defineProperty( jQuery.Event.prototype, name, { - enumerable: true, - configurable: true, - - get: isFunction( hook ) ? - function() { - if ( this.originalEvent ) { - return hook( this.originalEvent ); - } - } : - function() { - if ( this.originalEvent ) { - return this.originalEvent[ name ]; - } - }, - - set: function( value ) { - Object.defineProperty( this, name, { - enumerable: true, - configurable: true, - writable: true, - value: value - } ); - } - } ); - }, - - fix: function( originalEvent ) { - return originalEvent[ jQuery.expando ] ? - originalEvent : - new jQuery.Event( originalEvent ); - }, - - special: { - load: { - - // Prevent triggered image.load events from bubbling to window.load - noBubble: true - }, - click: { - - // Utilize native event to ensure correct state for checkable inputs - setup: function( data ) { - - // For mutual compressibility with _default, replace `this` access with a local var. - // `|| data` is dead code meant only to preserve the variable through minification. - var el = this || data; - - // Claim the first handler - if ( rcheckableType.test( el.type ) && - el.click && nodeName( el, "input" ) ) { - - // dataPriv.set( el, "click", ... 
) - leverageNative( el, "click", returnTrue ); - } - - // Return false to allow normal processing in the caller - return false; - }, - trigger: function( data ) { - - // For mutual compressibility with _default, replace `this` access with a local var. - // `|| data` is dead code meant only to preserve the variable through minification. - var el = this || data; - - // Force setup before triggering a click - if ( rcheckableType.test( el.type ) && - el.click && nodeName( el, "input" ) ) { - - leverageNative( el, "click" ); - } - - // Return non-false to allow normal event-path propagation - return true; - }, - - // For cross-browser consistency, suppress native .click() on links - // Also prevent it if we're currently inside a leveraged native-event stack - _default: function( event ) { - var target = event.target; - return rcheckableType.test( target.type ) && - target.click && nodeName( target, "input" ) && - dataPriv.get( target, "click" ) || - nodeName( target, "a" ); - } - }, - - beforeunload: { - postDispatch: function( event ) { - - // Support: Firefox 20+ - // Firefox doesn't alert if the returnValue field is not set. - if ( event.result !== undefined && event.originalEvent ) { - event.originalEvent.returnValue = event.result; - } - } - } - } -}; - -// Ensure the presence of an event listener that handles manually-triggered -// synthetic events by interrupting progress until reinvoked in response to -// *native* events that it fires directly, ensuring that state changes have -// already occurred before other listeners are invoked. 
-function leverageNative( el, type, expectSync ) { - - // Missing expectSync indicates a trigger call, which must force setup through jQuery.event.add - if ( !expectSync ) { - if ( dataPriv.get( el, type ) === undefined ) { - jQuery.event.add( el, type, returnTrue ); - } - return; - } - - // Register the controller as a special universal handler for all event namespaces - dataPriv.set( el, type, false ); - jQuery.event.add( el, type, { - namespace: false, - handler: function( event ) { - var notAsync, result, - saved = dataPriv.get( this, type ); - - if ( ( event.isTrigger & 1 ) && this[ type ] ) { - - // Interrupt processing of the outer synthetic .trigger()ed event - // Saved data should be false in such cases, but might be a leftover capture object - // from an async native handler (gh-4350) - if ( !saved.length ) { - - // Store arguments for use when handling the inner native event - // There will always be at least one argument (an event object), so this array - // will not be confused with a leftover capture object. - saved = slice.call( arguments ); - dataPriv.set( this, type, saved ); - - // Trigger the native event and capture its result - // Support: IE <=9 - 11+ - // focus() and blur() are asynchronous - notAsync = expectSync( this, type ); - this[ type ](); - result = dataPriv.get( this, type ); - if ( saved !== result || notAsync ) { - dataPriv.set( this, type, false ); - } else { - result = {}; - } - if ( saved !== result ) { - - // Cancel the outer synthetic event - event.stopImmediatePropagation(); - event.preventDefault(); - return result.value; - } - - // If this is an inner synthetic event for an event with a bubbling surrogate - // (focus or blur), assume that the surrogate already propagated from triggering the - // native event and prevent that from happening again here. - // This technically gets the ordering wrong w.r.t. 
to `.trigger()` (in which the - // bubbling surrogate propagates *after* the non-bubbling base), but that seems - // less bad than duplication. - } else if ( ( jQuery.event.special[ type ] || {} ).delegateType ) { - event.stopPropagation(); - } - - // If this is a native event triggered above, everything is now in order - // Fire an inner synthetic event with the original arguments - } else if ( saved.length ) { - - // ...and capture the result - dataPriv.set( this, type, { - value: jQuery.event.trigger( - - // Support: IE <=9 - 11+ - // Extend with the prototype to reset the above stopImmediatePropagation() - jQuery.extend( saved[ 0 ], jQuery.Event.prototype ), - saved.slice( 1 ), - this - ) - } ); - - // Abort handling of the native event - event.stopImmediatePropagation(); - } - } - } ); -} - -jQuery.removeEvent = function( elem, type, handle ) { - - // This "if" is needed for plain objects - if ( elem.removeEventListener ) { - elem.removeEventListener( type, handle ); - } -}; - -jQuery.Event = function( src, props ) { - - // Allow instantiation without the 'new' keyword - if ( !( this instanceof jQuery.Event ) ) { - return new jQuery.Event( src, props ); - } - - // Event object - if ( src && src.type ) { - this.originalEvent = src; - this.type = src.type; - - // Events bubbling up the document may have been marked as prevented - // by a handler lower down the tree; reflect the correct value. - this.isDefaultPrevented = src.defaultPrevented || - src.defaultPrevented === undefined && - - // Support: Android <=2.3 only - src.returnValue === false ? - returnTrue : - returnFalse; - - // Create target properties - // Support: Safari <=6 - 7 only - // Target should not be a text node (#504, #13143) - this.target = ( src.target && src.target.nodeType === 3 ) ? 
- src.target.parentNode : - src.target; - - this.currentTarget = src.currentTarget; - this.relatedTarget = src.relatedTarget; - - // Event type - } else { - this.type = src; - } - - // Put explicitly provided properties onto the event object - if ( props ) { - jQuery.extend( this, props ); - } - - // Create a timestamp if incoming event doesn't have one - this.timeStamp = src && src.timeStamp || Date.now(); - - // Mark it as fixed - this[ jQuery.expando ] = true; -}; - -// jQuery.Event is based on DOM3 Events as specified by the ECMAScript Language Binding -// https://www.w3.org/TR/2003/WD-DOM-Level-3-Events-20030331/ecma-script-binding.html -jQuery.Event.prototype = { - constructor: jQuery.Event, - isDefaultPrevented: returnFalse, - isPropagationStopped: returnFalse, - isImmediatePropagationStopped: returnFalse, - isSimulated: false, - - preventDefault: function() { - var e = this.originalEvent; - - this.isDefaultPrevented = returnTrue; - - if ( e && !this.isSimulated ) { - e.preventDefault(); - } - }, - stopPropagation: function() { - var e = this.originalEvent; - - this.isPropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopPropagation(); - } - }, - stopImmediatePropagation: function() { - var e = this.originalEvent; - - this.isImmediatePropagationStopped = returnTrue; - - if ( e && !this.isSimulated ) { - e.stopImmediatePropagation(); - } - - this.stopPropagation(); - } -}; - -// Includes all common event props including KeyEvent and MouseEvent specific props -jQuery.each( { - altKey: true, - bubbles: true, - cancelable: true, - changedTouches: true, - ctrlKey: true, - detail: true, - eventPhase: true, - metaKey: true, - pageX: true, - pageY: true, - shiftKey: true, - view: true, - "char": true, - code: true, - charCode: true, - key: true, - keyCode: true, - button: true, - buttons: true, - clientX: true, - clientY: true, - offsetX: true, - offsetY: true, - pointerId: true, - pointerType: true, - screenX: true, - screenY: true, - 
targetTouches: true, - toElement: true, - touches: true, - - which: function( event ) { - var button = event.button; - - // Add which for key events - if ( event.which == null && rkeyEvent.test( event.type ) ) { - return event.charCode != null ? event.charCode : event.keyCode; - } - - // Add which for click: 1 === left; 2 === middle; 3 === right - if ( !event.which && button !== undefined && rmouseEvent.test( event.type ) ) { - if ( button & 1 ) { - return 1; - } - - if ( button & 2 ) { - return 3; - } - - if ( button & 4 ) { - return 2; - } - - return 0; - } - - return event.which; - } -}, jQuery.event.addProp ); - -jQuery.each( { focus: "focusin", blur: "focusout" }, function( type, delegateType ) { - jQuery.event.special[ type ] = { - - // Utilize native event if possible so blur/focus sequence is correct - setup: function() { - - // Claim the first handler - // dataPriv.set( this, "focus", ... ) - // dataPriv.set( this, "blur", ... ) - leverageNative( this, type, expectSync ); - - // Return false to allow normal processing in the caller - return false; - }, - trigger: function() { - - // Force setup before trigger - leverageNative( this, type ); - - // Return non-false to allow normal event-path propagation - return true; - }, - - delegateType: delegateType - }; -} ); - -// Create mouseenter/leave events using mouseover/out and event-time checks -// so that event delegation works in jQuery. -// Do the same for pointerenter/pointerleave and pointerover/pointerout -// -// Support: Safari 7 only -// Safari sends mouseenter too often; see: -// https://bugs.chromium.org/p/chromium/issues/detail?id=470258 -// for the description of the bug (it existed in older Chrome versions as well). 
-jQuery.each( { - mouseenter: "mouseover", - mouseleave: "mouseout", - pointerenter: "pointerover", - pointerleave: "pointerout" -}, function( orig, fix ) { - jQuery.event.special[ orig ] = { - delegateType: fix, - bindType: fix, - - handle: function( event ) { - var ret, - target = this, - related = event.relatedTarget, - handleObj = event.handleObj; - - // For mouseenter/leave call the handler if related is outside the target. - // NB: No relatedTarget if the mouse left/entered the browser window - if ( !related || ( related !== target && !jQuery.contains( target, related ) ) ) { - event.type = handleObj.origType; - ret = handleObj.handler.apply( this, arguments ); - event.type = fix; - } - return ret; - } - }; -} ); - -jQuery.fn.extend( { - - on: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn ); - }, - one: function( types, selector, data, fn ) { - return on( this, types, selector, data, fn, 1 ); - }, - off: function( types, selector, fn ) { - var handleObj, type; - if ( types && types.preventDefault && types.handleObj ) { - - // ( event ) dispatched jQuery.Event - handleObj = types.handleObj; - jQuery( types.delegateTarget ).off( - handleObj.namespace ? - handleObj.origType + "." + handleObj.namespace : - handleObj.origType, - handleObj.selector, - handleObj.handler - ); - return this; - } - if ( typeof types === "object" ) { - - // ( types-object [, selector] ) - for ( type in types ) { - this.off( type, selector, types[ type ] ); - } - return this; - } - if ( selector === false || typeof selector === "function" ) { - - // ( types [, fn] ) - fn = selector; - selector = undefined; - } - if ( fn === false ) { - fn = returnFalse; - } - return this.each( function() { - jQuery.event.remove( this, types, fn, selector ); - } ); - } -} ); - - -var - - // Support: IE <=10 - 11, Edge 12 - 13 only - // In IE/Edge using regex groups here causes severe slowdowns. 
- // See https://connect.microsoft.com/IE/feedback/details/1736512/ - rnoInnerhtml = /\s*$/g; - -// Prefer a tbody over its parent table for containing new rows -function manipulationTarget( elem, content ) { - if ( nodeName( elem, "table" ) && - nodeName( content.nodeType !== 11 ? content : content.firstChild, "tr" ) ) { - - return jQuery( elem ).children( "tbody" )[ 0 ] || elem; - } - - return elem; -} - -// Replace/restore the type attribute of script elements for safe DOM manipulation -function disableScript( elem ) { - elem.type = ( elem.getAttribute( "type" ) !== null ) + "/" + elem.type; - return elem; -} -function restoreScript( elem ) { - if ( ( elem.type || "" ).slice( 0, 5 ) === "true/" ) { - elem.type = elem.type.slice( 5 ); - } else { - elem.removeAttribute( "type" ); - } - - return elem; -} - -function cloneCopyEvent( src, dest ) { - var i, l, type, pdataOld, udataOld, udataCur, events; - - if ( dest.nodeType !== 1 ) { - return; - } - - // 1. Copy private data: events, handlers, etc. - if ( dataPriv.hasData( src ) ) { - pdataOld = dataPriv.get( src ); - events = pdataOld.events; - - if ( events ) { - dataPriv.remove( dest, "handle events" ); - - for ( type in events ) { - for ( i = 0, l = events[ type ].length; i < l; i++ ) { - jQuery.event.add( dest, type, events[ type ][ i ] ); - } - } - } - } - - // 2. Copy user data - if ( dataUser.hasData( src ) ) { - udataOld = dataUser.access( src ); - udataCur = jQuery.extend( {}, udataOld ); - - dataUser.set( dest, udataCur ); - } -} - -// Fix IE bugs, see support tests -function fixInput( src, dest ) { - var nodeName = dest.nodeName.toLowerCase(); - - // Fails to persist the checked state of a cloned checkbox or radio button. 
- if ( nodeName === "input" && rcheckableType.test( src.type ) ) { - dest.checked = src.checked; - - // Fails to return the selected option to the default selected state when cloning options - } else if ( nodeName === "input" || nodeName === "textarea" ) { - dest.defaultValue = src.defaultValue; - } -} - -function domManip( collection, args, callback, ignored ) { - - // Flatten any nested arrays - args = flat( args ); - - var fragment, first, scripts, hasScripts, node, doc, - i = 0, - l = collection.length, - iNoClone = l - 1, - value = args[ 0 ], - valueIsFunction = isFunction( value ); - - // We can't cloneNode fragments that contain checked, in WebKit - if ( valueIsFunction || - ( l > 1 && typeof value === "string" && - !support.checkClone && rchecked.test( value ) ) ) { - return collection.each( function( index ) { - var self = collection.eq( index ); - if ( valueIsFunction ) { - args[ 0 ] = value.call( this, index, self.html() ); - } - domManip( self, args, callback, ignored ); - } ); - } - - if ( l ) { - fragment = buildFragment( args, collection[ 0 ].ownerDocument, false, collection, ignored ); - first = fragment.firstChild; - - if ( fragment.childNodes.length === 1 ) { - fragment = first; - } - - // Require either new content or an interest in ignored elements to invoke the callback - if ( first || ignored ) { - scripts = jQuery.map( getAll( fragment, "script" ), disableScript ); - hasScripts = scripts.length; - - // Use the original fragment for the last item - // instead of the first because it can end up - // being emptied incorrectly in certain situations (#8070). 
- for ( ; i < l; i++ ) { - node = fragment; - - if ( i !== iNoClone ) { - node = jQuery.clone( node, true, true ); - - // Keep references to cloned scripts for later restoration - if ( hasScripts ) { - - // Support: Android <=4.0 only, PhantomJS 1 only - // push.apply(_, arraylike) throws on ancient WebKit - jQuery.merge( scripts, getAll( node, "script" ) ); - } - } - - callback.call( collection[ i ], node, i ); - } - - if ( hasScripts ) { - doc = scripts[ scripts.length - 1 ].ownerDocument; - - // Reenable scripts - jQuery.map( scripts, restoreScript ); - - // Evaluate executable scripts on first document insertion - for ( i = 0; i < hasScripts; i++ ) { - node = scripts[ i ]; - if ( rscriptType.test( node.type || "" ) && - !dataPriv.access( node, "globalEval" ) && - jQuery.contains( doc, node ) ) { - - if ( node.src && ( node.type || "" ).toLowerCase() !== "module" ) { - - // Optional AJAX dependency, but won't run scripts if not present - if ( jQuery._evalUrl && !node.noModule ) { - jQuery._evalUrl( node.src, { - nonce: node.nonce || node.getAttribute( "nonce" ) - }, doc ); - } - } else { - DOMEval( node.textContent.replace( rcleanScript, "" ), node, doc ); - } - } - } - } - } - } - - return collection; -} - -function remove( elem, selector, keepData ) { - var node, - nodes = selector ? 
jQuery.filter( selector, elem ) : elem, - i = 0; - - for ( ; ( node = nodes[ i ] ) != null; i++ ) { - if ( !keepData && node.nodeType === 1 ) { - jQuery.cleanData( getAll( node ) ); - } - - if ( node.parentNode ) { - if ( keepData && isAttached( node ) ) { - setGlobalEval( getAll( node, "script" ) ); - } - node.parentNode.removeChild( node ); - } - } - - return elem; -} - -jQuery.extend( { - htmlPrefilter: function( html ) { - return html; - }, - - clone: function( elem, dataAndEvents, deepDataAndEvents ) { - var i, l, srcElements, destElements, - clone = elem.cloneNode( true ), - inPage = isAttached( elem ); - - // Fix IE cloning issues - if ( !support.noCloneChecked && ( elem.nodeType === 1 || elem.nodeType === 11 ) && - !jQuery.isXMLDoc( elem ) ) { - - // We eschew Sizzle here for performance reasons: https://jsperf.com/getall-vs-sizzle/2 - destElements = getAll( clone ); - srcElements = getAll( elem ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - fixInput( srcElements[ i ], destElements[ i ] ); - } - } - - // Copy the events from the original to the clone - if ( dataAndEvents ) { - if ( deepDataAndEvents ) { - srcElements = srcElements || getAll( elem ); - destElements = destElements || getAll( clone ); - - for ( i = 0, l = srcElements.length; i < l; i++ ) { - cloneCopyEvent( srcElements[ i ], destElements[ i ] ); - } - } else { - cloneCopyEvent( elem, clone ); - } - } - - // Preserve script evaluation history - destElements = getAll( clone, "script" ); - if ( destElements.length > 0 ) { - setGlobalEval( destElements, !inPage && getAll( elem, "script" ) ); - } - - // Return the cloned set - return clone; - }, - - cleanData: function( elems ) { - var data, elem, type, - special = jQuery.event.special, - i = 0; - - for ( ; ( elem = elems[ i ] ) !== undefined; i++ ) { - if ( acceptData( elem ) ) { - if ( ( data = elem[ dataPriv.expando ] ) ) { - if ( data.events ) { - for ( type in data.events ) { - if ( special[ type ] ) { - jQuery.event.remove( 
elem, type ); - - // This is a shortcut to avoid jQuery.event.remove's overhead - } else { - jQuery.removeEvent( elem, type, data.handle ); - } - } - } - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataPriv.expando ] = undefined; - } - if ( elem[ dataUser.expando ] ) { - - // Support: Chrome <=35 - 45+ - // Assign undefined instead of using delete, see Data#remove - elem[ dataUser.expando ] = undefined; - } - } - } - } -} ); - -jQuery.fn.extend( { - detach: function( selector ) { - return remove( this, selector, true ); - }, - - remove: function( selector ) { - return remove( this, selector ); - }, - - text: function( value ) { - return access( this, function( value ) { - return value === undefined ? - jQuery.text( this ) : - this.empty().each( function() { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - this.textContent = value; - } - } ); - }, null, value, arguments.length ); - }, - - append: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.appendChild( elem ); - } - } ); - }, - - prepend: function() { - return domManip( this, arguments, function( elem ) { - if ( this.nodeType === 1 || this.nodeType === 11 || this.nodeType === 9 ) { - var target = manipulationTarget( this, elem ); - target.insertBefore( elem, target.firstChild ); - } - } ); - }, - - before: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this ); - } - } ); - }, - - after: function() { - return domManip( this, arguments, function( elem ) { - if ( this.parentNode ) { - this.parentNode.insertBefore( elem, this.nextSibling ); - } - } ); - }, - - empty: function() { - var elem, - i = 0; - - for ( ; ( elem = this[ i ] ) != null; i++ ) { - if ( elem.nodeType === 1 ) { - - // 
Prevent memory leaks - jQuery.cleanData( getAll( elem, false ) ); - - // Remove any remaining nodes - elem.textContent = ""; - } - } - - return this; - }, - - clone: function( dataAndEvents, deepDataAndEvents ) { - dataAndEvents = dataAndEvents == null ? false : dataAndEvents; - deepDataAndEvents = deepDataAndEvents == null ? dataAndEvents : deepDataAndEvents; - - return this.map( function() { - return jQuery.clone( this, dataAndEvents, deepDataAndEvents ); - } ); - }, - - html: function( value ) { - return access( this, function( value ) { - var elem = this[ 0 ] || {}, - i = 0, - l = this.length; - - if ( value === undefined && elem.nodeType === 1 ) { - return elem.innerHTML; - } - - // See if we can take a shortcut and just use innerHTML - if ( typeof value === "string" && !rnoInnerhtml.test( value ) && - !wrapMap[ ( rtagName.exec( value ) || [ "", "" ] )[ 1 ].toLowerCase() ] ) { - - value = jQuery.htmlPrefilter( value ); - - try { - for ( ; i < l; i++ ) { - elem = this[ i ] || {}; - - // Remove element nodes and prevent memory leaks - if ( elem.nodeType === 1 ) { - jQuery.cleanData( getAll( elem, false ) ); - elem.innerHTML = value; - } - } - - elem = 0; - - // If using innerHTML throws an exception, use the fallback method - } catch ( e ) {} - } - - if ( elem ) { - this.empty().append( value ); - } - }, null, value, arguments.length ); - }, - - replaceWith: function() { - var ignored = []; - - // Make the changes, replacing each non-ignored context element with the new content - return domManip( this, arguments, function( elem ) { - var parent = this.parentNode; - - if ( jQuery.inArray( this, ignored ) < 0 ) { - jQuery.cleanData( getAll( this ) ); - if ( parent ) { - parent.replaceChild( elem, this ); - } - } - - // Force callback invocation - }, ignored ); - } -} ); - -jQuery.each( { - appendTo: "append", - prependTo: "prepend", - insertBefore: "before", - insertAfter: "after", - replaceAll: "replaceWith" -}, function( name, original ) { - jQuery.fn[ name ] = 
function( selector ) { - var elems, - ret = [], - insert = jQuery( selector ), - last = insert.length - 1, - i = 0; - - for ( ; i <= last; i++ ) { - elems = i === last ? this : this.clone( true ); - jQuery( insert[ i ] )[ original ]( elems ); - - // Support: Android <=4.0 only, PhantomJS 1 only - // .get() because push.apply(_, arraylike) throws on ancient WebKit - push.apply( ret, elems.get() ); - } - - return this.pushStack( ret ); - }; -} ); -var rnumnonpx = new RegExp( "^(" + pnum + ")(?!px)[a-z%]+$", "i" ); - -var getStyles = function( elem ) { - - // Support: IE <=11 only, Firefox <=30 (#15098, #14150) - // IE throws on elements created in popups - // FF meanwhile throws on frame elements through "defaultView.getComputedStyle" - var view = elem.ownerDocument.defaultView; - - if ( !view || !view.opener ) { - view = window; - } - - return view.getComputedStyle( elem ); - }; - -var swap = function( elem, options, callback ) { - var ret, name, - old = {}; - - // Remember the old values, and insert the new ones - for ( name in options ) { - old[ name ] = elem.style[ name ]; - elem.style[ name ] = options[ name ]; - } - - ret = callback.call( elem ); - - // Revert the old values - for ( name in options ) { - elem.style[ name ] = old[ name ]; - } - - return ret; -}; - - -var rboxStyle = new RegExp( cssExpand.join( "|" ), "i" ); - - - -( function() { - - // Executing both pixelPosition & boxSizingReliable tests require only one layout - // so they're executed at the same time to save the second computation. 
- function computeStyleTests() { - - // This is a singleton, we need to execute it only once - if ( !div ) { - return; - } - - container.style.cssText = "position:absolute;left:-11111px;width:60px;" + - "margin-top:1px;padding:0;border:0"; - div.style.cssText = - "position:relative;display:block;box-sizing:border-box;overflow:scroll;" + - "margin:auto;border:1px;padding:1px;" + - "width:60%;top:1%"; - documentElement.appendChild( container ).appendChild( div ); - - var divStyle = window.getComputedStyle( div ); - pixelPositionVal = divStyle.top !== "1%"; - - // Support: Android 4.0 - 4.3 only, Firefox <=3 - 44 - reliableMarginLeftVal = roundPixelMeasures( divStyle.marginLeft ) === 12; - - // Support: Android 4.0 - 4.3 only, Safari <=9.1 - 10.1, iOS <=7.0 - 9.3 - // Some styles come back with percentage values, even though they shouldn't - div.style.right = "60%"; - pixelBoxStylesVal = roundPixelMeasures( divStyle.right ) === 36; - - // Support: IE 9 - 11 only - // Detect misreporting of content dimensions for box-sizing:border-box elements - boxSizingReliableVal = roundPixelMeasures( divStyle.width ) === 36; - - // Support: IE 9 only - // Detect overflow:scroll screwiness (gh-3699) - // Support: Chrome <=64 - // Don't get tricked when zoom affects offsetWidth (gh-4029) - div.style.position = "absolute"; - scrollboxSizeVal = roundPixelMeasures( div.offsetWidth / 3 ) === 12; - - documentElement.removeChild( container ); - - // Nullify the div so it wouldn't be stored in the memory and - // it will also be a sign that checks already performed - div = null; - } - - function roundPixelMeasures( measure ) { - return Math.round( parseFloat( measure ) ); - } - - var pixelPositionVal, boxSizingReliableVal, scrollboxSizeVal, pixelBoxStylesVal, - reliableTrDimensionsVal, reliableMarginLeftVal, - container = document.createElement( "div" ), - div = document.createElement( "div" ); - - // Finish early in limited (non-browser) environments - if ( !div.style ) { - return; - } - - 
// Support: IE <=9 - 11 only - // Style of cloned element affects source element cloned (#8908) - div.style.backgroundClip = "content-box"; - div.cloneNode( true ).style.backgroundClip = ""; - support.clearCloneStyle = div.style.backgroundClip === "content-box"; - - jQuery.extend( support, { - boxSizingReliable: function() { - computeStyleTests(); - return boxSizingReliableVal; - }, - pixelBoxStyles: function() { - computeStyleTests(); - return pixelBoxStylesVal; - }, - pixelPosition: function() { - computeStyleTests(); - return pixelPositionVal; - }, - reliableMarginLeft: function() { - computeStyleTests(); - return reliableMarginLeftVal; - }, - scrollboxSize: function() { - computeStyleTests(); - return scrollboxSizeVal; - }, - - // Support: IE 9 - 11+, Edge 15 - 18+ - // IE/Edge misreport `getComputedStyle` of table rows with width/height - // set in CSS while `offset*` properties report correct values. - // Behavior in IE 9 is more subtle than in newer versions & it passes - // some versions of this test; make sure not to make it pass there! 
- reliableTrDimensions: function() { - var table, tr, trChild, trStyle; - if ( reliableTrDimensionsVal == null ) { - table = document.createElement( "table" ); - tr = document.createElement( "tr" ); - trChild = document.createElement( "div" ); - - table.style.cssText = "position:absolute;left:-11111px"; - tr.style.height = "1px"; - trChild.style.height = "9px"; - - documentElement - .appendChild( table ) - .appendChild( tr ) - .appendChild( trChild ); - - trStyle = window.getComputedStyle( tr ); - reliableTrDimensionsVal = parseInt( trStyle.height ) > 3; - - documentElement.removeChild( table ); - } - return reliableTrDimensionsVal; - } - } ); -} )(); - - -function curCSS( elem, name, computed ) { - var width, minWidth, maxWidth, ret, - - // Support: Firefox 51+ - // Retrieving style before computed somehow - // fixes an issue with getting wrong values - // on detached elements - style = elem.style; - - computed = computed || getStyles( elem ); - - // getPropertyValue is needed for: - // .css('filter') (IE 9 only, #12537) - // .css('--customProperty) (#3144) - if ( computed ) { - ret = computed.getPropertyValue( name ) || computed[ name ]; - - if ( ret === "" && !isAttached( elem ) ) { - ret = jQuery.style( elem, name ); - } - - // A tribute to the "awesome hack by Dean Edwards" - // Android Browser returns percentage for some values, - // but width seems to be reliably pixels. - // This is against the CSSOM draft spec: - // https://drafts.csswg.org/cssom/#resolved-values - if ( !support.pixelBoxStyles() && rnumnonpx.test( ret ) && rboxStyle.test( name ) ) { - - // Remember the original values - width = style.width; - minWidth = style.minWidth; - maxWidth = style.maxWidth; - - // Put in the new values to get a computed value out - style.minWidth = style.maxWidth = style.width = ret; - ret = computed.width; - - // Revert the changed values - style.width = width; - style.minWidth = minWidth; - style.maxWidth = maxWidth; - } - } - - return ret !== undefined ? 
- - // Support: IE <=9 - 11 only - // IE returns zIndex value as an integer. - ret + "" : - ret; -} - - -function addGetHookIf( conditionFn, hookFn ) { - - // Define the hook, we'll check on the first run if it's really needed. - return { - get: function() { - if ( conditionFn() ) { - - // Hook not needed (or it's not possible to use it due - // to missing dependency), remove it. - delete this.get; - return; - } - - // Hook needed; redefine it so that the support test is not executed again. - return ( this.get = hookFn ).apply( this, arguments ); - } - }; -} - - -var cssPrefixes = [ "Webkit", "Moz", "ms" ], - emptyStyle = document.createElement( "div" ).style, - vendorProps = {}; - -// Return a vendor-prefixed property or undefined -function vendorPropName( name ) { - - // Check for vendor prefixed names - var capName = name[ 0 ].toUpperCase() + name.slice( 1 ), - i = cssPrefixes.length; - - while ( i-- ) { - name = cssPrefixes[ i ] + capName; - if ( name in emptyStyle ) { - return name; - } - } -} - -// Return a potentially-mapped jQuery.cssProps or vendor prefixed property -function finalPropName( name ) { - var final = jQuery.cssProps[ name ] || vendorProps[ name ]; - - if ( final ) { - return final; - } - if ( name in emptyStyle ) { - return name; - } - return vendorProps[ name ] = vendorPropName( name ) || name; -} - - -var - - // Swappable if display is none or starts with table - // except "table", "table-cell", or "table-caption" - // See here for display values: https://developer.mozilla.org/en-US/docs/CSS/display - rdisplayswap = /^(none|table(?!-c[ea]).+)/, - rcustomProp = /^--/, - cssShow = { position: "absolute", visibility: "hidden", display: "block" }, - cssNormalTransform = { - letterSpacing: "0", - fontWeight: "400" - }; - -function setPositiveNumber( _elem, value, subtract ) { - - // Any relative (+/-) values have already been - // normalized at this point - var matches = rcssNum.exec( value ); - return matches ? 
- - // Guard against undefined "subtract", e.g., when used as in cssHooks - Math.max( 0, matches[ 2 ] - ( subtract || 0 ) ) + ( matches[ 3 ] || "px" ) : - value; -} - -function boxModelAdjustment( elem, dimension, box, isBorderBox, styles, computedVal ) { - var i = dimension === "width" ? 1 : 0, - extra = 0, - delta = 0; - - // Adjustment may not be necessary - if ( box === ( isBorderBox ? "border" : "content" ) ) { - return 0; - } - - for ( ; i < 4; i += 2 ) { - - // Both box models exclude margin - if ( box === "margin" ) { - delta += jQuery.css( elem, box + cssExpand[ i ], true, styles ); - } - - // If we get here with a content-box, we're seeking "padding" or "border" or "margin" - if ( !isBorderBox ) { - - // Add padding - delta += jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - - // For "border" or "margin", add border - if ( box !== "padding" ) { - delta += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - - // But still keep track of it otherwise - } else { - extra += jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - - // If we get here with a border-box (content + padding + border), we're seeking "content" or - // "padding" or "margin" - } else { - - // For "content", subtract padding - if ( box === "content" ) { - delta -= jQuery.css( elem, "padding" + cssExpand[ i ], true, styles ); - } - - // For "content" or "padding", subtract border - if ( box !== "margin" ) { - delta -= jQuery.css( elem, "border" + cssExpand[ i ] + "Width", true, styles ); - } - } - } - - // Account for positive content-box scroll gutter when requested by providing computedVal - if ( !isBorderBox && computedVal >= 0 ) { - - // offsetWidth/offsetHeight is a rounded sum of content, padding, scroll gutter, and border - // Assuming integer scroll gutter, subtract the rest and round down - delta += Math.max( 0, Math.ceil( - elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - - computedVal - - delta - - 
extra - - 0.5 - - // If offsetWidth/offsetHeight is unknown, then we can't determine content-box scroll gutter - // Use an explicit zero to avoid NaN (gh-3964) - ) ) || 0; - } - - return delta; -} - -function getWidthOrHeight( elem, dimension, extra ) { - - // Start with computed style - var styles = getStyles( elem ), - - // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-4322). - // Fake content-box until we know it's needed to know the true value. - boxSizingNeeded = !support.boxSizingReliable() || extra, - isBorderBox = boxSizingNeeded && - jQuery.css( elem, "boxSizing", false, styles ) === "border-box", - valueIsBorderBox = isBorderBox, - - val = curCSS( elem, dimension, styles ), - offsetProp = "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ); - - // Support: Firefox <=54 - // Return a confounding non-pixel value or feign ignorance, as appropriate. - if ( rnumnonpx.test( val ) ) { - if ( !extra ) { - return val; - } - val = "auto"; - } - - - // Support: IE 9 - 11 only - // Use offsetWidth/offsetHeight for when box sizing is unreliable. - // In those cases, the computed value can be trusted to be border-box. - if ( ( !support.boxSizingReliable() && isBorderBox || - - // Support: IE 10 - 11+, Edge 15 - 18+ - // IE/Edge misreport `getComputedStyle` of table rows with width/height - // set in CSS while `offset*` properties report correct values. - // Interestingly, in some cases IE 9 doesn't suffer from this issue. 
- !support.reliableTrDimensions() && nodeName( elem, "tr" ) || - - // Fall back to offsetWidth/offsetHeight when value is "auto" - // This happens for inline elements with no explicit setting (gh-3571) - val === "auto" || - - // Support: Android <=4.1 - 4.3 only - // Also use offsetWidth/offsetHeight for misreported inline dimensions (gh-3602) - !parseFloat( val ) && jQuery.css( elem, "display", false, styles ) === "inline" ) && - - // Make sure the element is visible & connected - elem.getClientRects().length ) { - - isBorderBox = jQuery.css( elem, "boxSizing", false, styles ) === "border-box"; - - // Where available, offsetWidth/offsetHeight approximate border box dimensions. - // Where not available (e.g., SVG), assume unreliable box-sizing and interpret the - // retrieved value as a content box dimension. - valueIsBorderBox = offsetProp in elem; - if ( valueIsBorderBox ) { - val = elem[ offsetProp ]; - } - } - - // Normalize "" and auto - val = parseFloat( val ) || 0; - - // Adjust for the element's box model - return ( val + - boxModelAdjustment( - elem, - dimension, - extra || ( isBorderBox ? "border" : "content" ), - valueIsBorderBox, - styles, - - // Provide the current computed size to request scroll gutter calculation (gh-3589) - val - ) - ) + "px"; -} - -jQuery.extend( { - - // Add in style property hooks for overriding the default - // behavior of getting and setting a style property - cssHooks: { - opacity: { - get: function( elem, computed ) { - if ( computed ) { - - // We should always get a number back from opacity - var ret = curCSS( elem, "opacity" ); - return ret === "" ? 
"1" : ret; - } - } - } - }, - - // Don't automatically add "px" to these possibly-unitless properties - cssNumber: { - "animationIterationCount": true, - "columnCount": true, - "fillOpacity": true, - "flexGrow": true, - "flexShrink": true, - "fontWeight": true, - "gridArea": true, - "gridColumn": true, - "gridColumnEnd": true, - "gridColumnStart": true, - "gridRow": true, - "gridRowEnd": true, - "gridRowStart": true, - "lineHeight": true, - "opacity": true, - "order": true, - "orphans": true, - "widows": true, - "zIndex": true, - "zoom": true - }, - - // Add in properties whose names you wish to fix before - // setting or getting the value - cssProps: {}, - - // Get and set the style property on a DOM Node - style: function( elem, name, value, extra ) { - - // Don't set styles on text and comment nodes - if ( !elem || elem.nodeType === 3 || elem.nodeType === 8 || !elem.style ) { - return; - } - - // Make sure that we're working with the right name - var ret, type, hooks, - origName = camelCase( name ), - isCustomProp = rcustomProp.test( name ), - style = elem.style; - - // Make sure that we're working with the right name. We don't - // want to query the value if it is a CSS custom property - // since they are user-defined. 
- if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Gets hook for the prefixed version, then unprefixed version - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // Check if we're setting a value - if ( value !== undefined ) { - type = typeof value; - - // Convert "+=" or "-=" to relative numbers (#7345) - if ( type === "string" && ( ret = rcssNum.exec( value ) ) && ret[ 1 ] ) { - value = adjustCSS( elem, name, ret ); - - // Fixes bug #9237 - type = "number"; - } - - // Make sure that null and NaN values aren't set (#7116) - if ( value == null || value !== value ) { - return; - } - - // If a number was passed in, add the unit (except for certain CSS properties) - // The isCustomProp check can be removed in jQuery 4.0 when we only auto-append - // "px" to a few hardcoded values. - if ( type === "number" && !isCustomProp ) { - value += ret && ret[ 3 ] || ( jQuery.cssNumber[ origName ] ? "" : "px" ); - } - - // background-* props affect original clone's values - if ( !support.clearCloneStyle && value === "" && name.indexOf( "background" ) === 0 ) { - style[ name ] = "inherit"; - } - - // If a hook was provided, use that value, otherwise just set the specified value - if ( !hooks || !( "set" in hooks ) || - ( value = hooks.set( elem, value, extra ) ) !== undefined ) { - - if ( isCustomProp ) { - style.setProperty( name, value ); - } else { - style[ name ] = value; - } - } - - } else { - - // If a hook was provided get the non-computed value from there - if ( hooks && "get" in hooks && - ( ret = hooks.get( elem, false, extra ) ) !== undefined ) { - - return ret; - } - - // Otherwise just get the value from the style object - return style[ name ]; - } - }, - - css: function( elem, name, extra, styles ) { - var val, num, hooks, - origName = camelCase( name ), - isCustomProp = rcustomProp.test( name ); - - // Make sure that we're working with the right name. 
We don't - // want to modify the value if it is a CSS custom property - // since they are user-defined. - if ( !isCustomProp ) { - name = finalPropName( origName ); - } - - // Try prefixed name followed by the unprefixed name - hooks = jQuery.cssHooks[ name ] || jQuery.cssHooks[ origName ]; - - // If a hook was provided get the computed value from there - if ( hooks && "get" in hooks ) { - val = hooks.get( elem, true, extra ); - } - - // Otherwise, if a way to get the computed value exists, use that - if ( val === undefined ) { - val = curCSS( elem, name, styles ); - } - - // Convert "normal" to computed value - if ( val === "normal" && name in cssNormalTransform ) { - val = cssNormalTransform[ name ]; - } - - // Make numeric if forced or a qualifier was provided and val looks numeric - if ( extra === "" || extra ) { - num = parseFloat( val ); - return extra === true || isFinite( num ) ? num || 0 : val; - } - - return val; - } -} ); - -jQuery.each( [ "height", "width" ], function( _i, dimension ) { - jQuery.cssHooks[ dimension ] = { - get: function( elem, computed, extra ) { - if ( computed ) { - - // Certain elements can have dimension info if we invisibly show them - // but it must have a current display style that would benefit - return rdisplayswap.test( jQuery.css( elem, "display" ) ) && - - // Support: Safari 8+ - // Table columns in Safari have non-zero offsetWidth & zero - // getBoundingClientRect().width unless display is changed. - // Support: IE <=11 only - // Running getBoundingClientRect on a disconnected node - // in IE throws an error. - ( !elem.getClientRects().length || !elem.getBoundingClientRect().width ) ? - swap( elem, cssShow, function() { - return getWidthOrHeight( elem, dimension, extra ); - } ) : - getWidthOrHeight( elem, dimension, extra ); - } - }, - - set: function( elem, value, extra ) { - var matches, - styles = getStyles( elem ), - - // Only read styles.position if the test has a chance to fail - // to avoid forcing a reflow. 
- scrollboxSizeBuggy = !support.scrollboxSize() && - styles.position === "absolute", - - // To avoid forcing a reflow, only fetch boxSizing if we need it (gh-3991) - boxSizingNeeded = scrollboxSizeBuggy || extra, - isBorderBox = boxSizingNeeded && - jQuery.css( elem, "boxSizing", false, styles ) === "border-box", - subtract = extra ? - boxModelAdjustment( - elem, - dimension, - extra, - isBorderBox, - styles - ) : - 0; - - // Account for unreliable border-box dimensions by comparing offset* to computed and - // faking a content-box to get border and padding (gh-3699) - if ( isBorderBox && scrollboxSizeBuggy ) { - subtract -= Math.ceil( - elem[ "offset" + dimension[ 0 ].toUpperCase() + dimension.slice( 1 ) ] - - parseFloat( styles[ dimension ] ) - - boxModelAdjustment( elem, dimension, "border", false, styles ) - - 0.5 - ); - } - - // Convert to pixels if value adjustment is needed - if ( subtract && ( matches = rcssNum.exec( value ) ) && - ( matches[ 3 ] || "px" ) !== "px" ) { - - elem.style[ dimension ] = value; - value = jQuery.css( elem, dimension ); - } - - return setPositiveNumber( elem, value, subtract ); - } - }; -} ); - -jQuery.cssHooks.marginLeft = addGetHookIf( support.reliableMarginLeft, - function( elem, computed ) { - if ( computed ) { - return ( parseFloat( curCSS( elem, "marginLeft" ) ) || - elem.getBoundingClientRect().left - - swap( elem, { marginLeft: 0 }, function() { - return elem.getBoundingClientRect().left; - } ) - ) + "px"; - } - } -); - -// These hooks are used by animate to expand properties -jQuery.each( { - margin: "", - padding: "", - border: "Width" -}, function( prefix, suffix ) { - jQuery.cssHooks[ prefix + suffix ] = { - expand: function( value ) { - var i = 0, - expanded = {}, - - // Assumes a single number if not a string - parts = typeof value === "string" ? 
value.split( " " ) : [ value ]; - - for ( ; i < 4; i++ ) { - expanded[ prefix + cssExpand[ i ] + suffix ] = - parts[ i ] || parts[ i - 2 ] || parts[ 0 ]; - } - - return expanded; - } - }; - - if ( prefix !== "margin" ) { - jQuery.cssHooks[ prefix + suffix ].set = setPositiveNumber; - } -} ); - -jQuery.fn.extend( { - css: function( name, value ) { - return access( this, function( elem, name, value ) { - var styles, len, - map = {}, - i = 0; - - if ( Array.isArray( name ) ) { - styles = getStyles( elem ); - len = name.length; - - for ( ; i < len; i++ ) { - map[ name[ i ] ] = jQuery.css( elem, name[ i ], false, styles ); - } - - return map; - } - - return value !== undefined ? - jQuery.style( elem, name, value ) : - jQuery.css( elem, name ); - }, name, value, arguments.length > 1 ); - } -} ); - - -function Tween( elem, options, prop, end, easing ) { - return new Tween.prototype.init( elem, options, prop, end, easing ); -} -jQuery.Tween = Tween; - -Tween.prototype = { - constructor: Tween, - init: function( elem, options, prop, end, easing, unit ) { - this.elem = elem; - this.prop = prop; - this.easing = easing || jQuery.easing._default; - this.options = options; - this.start = this.now = this.cur(); - this.end = end; - this.unit = unit || ( jQuery.cssNumber[ prop ] ? "" : "px" ); - }, - cur: function() { - var hooks = Tween.propHooks[ this.prop ]; - - return hooks && hooks.get ? 
- hooks.get( this ) : - Tween.propHooks._default.get( this ); - }, - run: function( percent ) { - var eased, - hooks = Tween.propHooks[ this.prop ]; - - if ( this.options.duration ) { - this.pos = eased = jQuery.easing[ this.easing ]( - percent, this.options.duration * percent, 0, 1, this.options.duration - ); - } else { - this.pos = eased = percent; - } - this.now = ( this.end - this.start ) * eased + this.start; - - if ( this.options.step ) { - this.options.step.call( this.elem, this.now, this ); - } - - if ( hooks && hooks.set ) { - hooks.set( this ); - } else { - Tween.propHooks._default.set( this ); - } - return this; - } -}; - -Tween.prototype.init.prototype = Tween.prototype; - -Tween.propHooks = { - _default: { - get: function( tween ) { - var result; - - // Use a property on the element directly when it is not a DOM element, - // or when there is no matching style property that exists. - if ( tween.elem.nodeType !== 1 || - tween.elem[ tween.prop ] != null && tween.elem.style[ tween.prop ] == null ) { - return tween.elem[ tween.prop ]; - } - - // Passing an empty string as a 3rd parameter to .css will automatically - // attempt a parseFloat and fallback to a string if the parse fails. - // Simple values such as "10px" are parsed to Float; - // complex values such as "rotate(1rad)" are returned as-is. - result = jQuery.css( tween.elem, tween.prop, "" ); - - // Empty strings, null, undefined and "auto" are converted to 0. - return !result || result === "auto" ? 0 : result; - }, - set: function( tween ) { - - // Use step hook for back compat. - // Use cssHook if its there. - // Use .style if available and use plain properties where available. 
- if ( jQuery.fx.step[ tween.prop ] ) { - jQuery.fx.step[ tween.prop ]( tween ); - } else if ( tween.elem.nodeType === 1 && ( - jQuery.cssHooks[ tween.prop ] || - tween.elem.style[ finalPropName( tween.prop ) ] != null ) ) { - jQuery.style( tween.elem, tween.prop, tween.now + tween.unit ); - } else { - tween.elem[ tween.prop ] = tween.now; - } - } - } -}; - -// Support: IE <=9 only -// Panic based approach to setting things on disconnected nodes -Tween.propHooks.scrollTop = Tween.propHooks.scrollLeft = { - set: function( tween ) { - if ( tween.elem.nodeType && tween.elem.parentNode ) { - tween.elem[ tween.prop ] = tween.now; - } - } -}; - -jQuery.easing = { - linear: function( p ) { - return p; - }, - swing: function( p ) { - return 0.5 - Math.cos( p * Math.PI ) / 2; - }, - _default: "swing" -}; - -jQuery.fx = Tween.prototype.init; - -// Back compat <1.8 extension point -jQuery.fx.step = {}; - - - - -var - fxNow, inProgress, - rfxtypes = /^(?:toggle|show|hide)$/, - rrun = /queueHooks$/; - -function schedule() { - if ( inProgress ) { - if ( document.hidden === false && window.requestAnimationFrame ) { - window.requestAnimationFrame( schedule ); - } else { - window.setTimeout( schedule, jQuery.fx.interval ); - } - - jQuery.fx.tick(); - } -} - -// Animations created synchronously will run synchronously -function createFxNow() { - window.setTimeout( function() { - fxNow = undefined; - } ); - return ( fxNow = Date.now() ); -} - -// Generate parameters to create a standard animation -function genFx( type, includeWidth ) { - var which, - i = 0, - attrs = { height: type }; - - // If we include width, step value is 1 to do all cssExpand values, - // otherwise step value is 2 to skip over Left and Right - includeWidth = includeWidth ? 
1 : 0; - for ( ; i < 4; i += 2 - includeWidth ) { - which = cssExpand[ i ]; - attrs[ "margin" + which ] = attrs[ "padding" + which ] = type; - } - - if ( includeWidth ) { - attrs.opacity = attrs.width = type; - } - - return attrs; -} - -function createTween( value, prop, animation ) { - var tween, - collection = ( Animation.tweeners[ prop ] || [] ).concat( Animation.tweeners[ "*" ] ), - index = 0, - length = collection.length; - for ( ; index < length; index++ ) { - if ( ( tween = collection[ index ].call( animation, prop, value ) ) ) { - - // We're done with this property - return tween; - } - } -} - -function defaultPrefilter( elem, props, opts ) { - var prop, value, toggle, hooks, oldfire, propTween, restoreDisplay, display, - isBox = "width" in props || "height" in props, - anim = this, - orig = {}, - style = elem.style, - hidden = elem.nodeType && isHiddenWithinTree( elem ), - dataShow = dataPriv.get( elem, "fxshow" ); - - // Queue-skipping animations hijack the fx hooks - if ( !opts.queue ) { - hooks = jQuery._queueHooks( elem, "fx" ); - if ( hooks.unqueued == null ) { - hooks.unqueued = 0; - oldfire = hooks.empty.fire; - hooks.empty.fire = function() { - if ( !hooks.unqueued ) { - oldfire(); - } - }; - } - hooks.unqueued++; - - anim.always( function() { - - // Ensure the complete handler is called before this completes - anim.always( function() { - hooks.unqueued--; - if ( !jQuery.queue( elem, "fx" ).length ) { - hooks.empty.fire(); - } - } ); - } ); - } - - // Detect show/hide animations - for ( prop in props ) { - value = props[ prop ]; - if ( rfxtypes.test( value ) ) { - delete props[ prop ]; - toggle = toggle || value === "toggle"; - if ( value === ( hidden ? 
"hide" : "show" ) ) { - - // Pretend to be hidden if this is a "show" and - // there is still data from a stopped show/hide - if ( value === "show" && dataShow && dataShow[ prop ] !== undefined ) { - hidden = true; - - // Ignore all other no-op show/hide data - } else { - continue; - } - } - orig[ prop ] = dataShow && dataShow[ prop ] || jQuery.style( elem, prop ); - } - } - - // Bail out if this is a no-op like .hide().hide() - propTween = !jQuery.isEmptyObject( props ); - if ( !propTween && jQuery.isEmptyObject( orig ) ) { - return; - } - - // Restrict "overflow" and "display" styles during box animations - if ( isBox && elem.nodeType === 1 ) { - - // Support: IE <=9 - 11, Edge 12 - 15 - // Record all 3 overflow attributes because IE does not infer the shorthand - // from identically-valued overflowX and overflowY and Edge just mirrors - // the overflowX value there. - opts.overflow = [ style.overflow, style.overflowX, style.overflowY ]; - - // Identify a display type, preferring old show/hide data over the CSS cascade - restoreDisplay = dataShow && dataShow.display; - if ( restoreDisplay == null ) { - restoreDisplay = dataPriv.get( elem, "display" ); - } - display = jQuery.css( elem, "display" ); - if ( display === "none" ) { - if ( restoreDisplay ) { - display = restoreDisplay; - } else { - - // Get nonempty value(s) by temporarily forcing visibility - showHide( [ elem ], true ); - restoreDisplay = elem.style.display || restoreDisplay; - display = jQuery.css( elem, "display" ); - showHide( [ elem ] ); - } - } - - // Animate inline elements as inline-block - if ( display === "inline" || display === "inline-block" && restoreDisplay != null ) { - if ( jQuery.css( elem, "float" ) === "none" ) { - - // Restore the original display value at the end of pure show/hide animations - if ( !propTween ) { - anim.done( function() { - style.display = restoreDisplay; - } ); - if ( restoreDisplay == null ) { - display = style.display; - restoreDisplay = display === "none" ? 
"" : display; - } - } - style.display = "inline-block"; - } - } - } - - if ( opts.overflow ) { - style.overflow = "hidden"; - anim.always( function() { - style.overflow = opts.overflow[ 0 ]; - style.overflowX = opts.overflow[ 1 ]; - style.overflowY = opts.overflow[ 2 ]; - } ); - } - - // Implement show/hide animations - propTween = false; - for ( prop in orig ) { - - // General show/hide setup for this element animation - if ( !propTween ) { - if ( dataShow ) { - if ( "hidden" in dataShow ) { - hidden = dataShow.hidden; - } - } else { - dataShow = dataPriv.access( elem, "fxshow", { display: restoreDisplay } ); - } - - // Store hidden/visible for toggle so `.stop().toggle()` "reverses" - if ( toggle ) { - dataShow.hidden = !hidden; - } - - // Show elements before animating them - if ( hidden ) { - showHide( [ elem ], true ); - } - - /* eslint-disable no-loop-func */ - - anim.done( function() { - - /* eslint-enable no-loop-func */ - - // The final step of a "hide" animation is actually hiding the element - if ( !hidden ) { - showHide( [ elem ] ); - } - dataPriv.remove( elem, "fxshow" ); - for ( prop in orig ) { - jQuery.style( elem, prop, orig[ prop ] ); - } - } ); - } - - // Per-property setup - propTween = createTween( hidden ? 
dataShow[ prop ] : 0, prop, anim ); - if ( !( prop in dataShow ) ) { - dataShow[ prop ] = propTween.start; - if ( hidden ) { - propTween.end = propTween.start; - propTween.start = 0; - } - } - } -} - -function propFilter( props, specialEasing ) { - var index, name, easing, value, hooks; - - // camelCase, specialEasing and expand cssHook pass - for ( index in props ) { - name = camelCase( index ); - easing = specialEasing[ name ]; - value = props[ index ]; - if ( Array.isArray( value ) ) { - easing = value[ 1 ]; - value = props[ index ] = value[ 0 ]; - } - - if ( index !== name ) { - props[ name ] = value; - delete props[ index ]; - } - - hooks = jQuery.cssHooks[ name ]; - if ( hooks && "expand" in hooks ) { - value = hooks.expand( value ); - delete props[ name ]; - - // Not quite $.extend, this won't overwrite existing keys. - // Reusing 'index' because we have the correct "name" - for ( index in value ) { - if ( !( index in props ) ) { - props[ index ] = value[ index ]; - specialEasing[ index ] = easing; - } - } - } else { - specialEasing[ name ] = easing; - } - } -} - -function Animation( elem, properties, options ) { - var result, - stopped, - index = 0, - length = Animation.prefilters.length, - deferred = jQuery.Deferred().always( function() { - - // Don't match elem in the :animated selector - delete tick.elem; - } ), - tick = function() { - if ( stopped ) { - return false; - } - var currentTime = fxNow || createFxNow(), - remaining = Math.max( 0, animation.startTime + animation.duration - currentTime ), - - // Support: Android 2.3 only - // Archaic crash bug won't allow us to use `1 - ( 0.5 || 0 )` (#12497) - temp = remaining / animation.duration || 0, - percent = 1 - temp, - index = 0, - length = animation.tweens.length; - - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( percent ); - } - - deferred.notifyWith( elem, [ animation, percent, remaining ] ); - - // If there's more to do, yield - if ( percent < 1 && length ) { - return 
remaining; - } - - // If this was an empty animation, synthesize a final progress notification - if ( !length ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - } - - // Resolve the animation and report its conclusion - deferred.resolveWith( elem, [ animation ] ); - return false; - }, - animation = deferred.promise( { - elem: elem, - props: jQuery.extend( {}, properties ), - opts: jQuery.extend( true, { - specialEasing: {}, - easing: jQuery.easing._default - }, options ), - originalProperties: properties, - originalOptions: options, - startTime: fxNow || createFxNow(), - duration: options.duration, - tweens: [], - createTween: function( prop, end ) { - var tween = jQuery.Tween( elem, animation.opts, prop, end, - animation.opts.specialEasing[ prop ] || animation.opts.easing ); - animation.tweens.push( tween ); - return tween; - }, - stop: function( gotoEnd ) { - var index = 0, - - // If we are going to the end, we want to run all the tweens - // otherwise we skip this part - length = gotoEnd ? 
animation.tweens.length : 0; - if ( stopped ) { - return this; - } - stopped = true; - for ( ; index < length; index++ ) { - animation.tweens[ index ].run( 1 ); - } - - // Resolve when we played the last frame; otherwise, reject - if ( gotoEnd ) { - deferred.notifyWith( elem, [ animation, 1, 0 ] ); - deferred.resolveWith( elem, [ animation, gotoEnd ] ); - } else { - deferred.rejectWith( elem, [ animation, gotoEnd ] ); - } - return this; - } - } ), - props = animation.props; - - propFilter( props, animation.opts.specialEasing ); - - for ( ; index < length; index++ ) { - result = Animation.prefilters[ index ].call( animation, elem, props, animation.opts ); - if ( result ) { - if ( isFunction( result.stop ) ) { - jQuery._queueHooks( animation.elem, animation.opts.queue ).stop = - result.stop.bind( result ); - } - return result; - } - } - - jQuery.map( props, createTween, animation ); - - if ( isFunction( animation.opts.start ) ) { - animation.opts.start.call( elem, animation ); - } - - // Attach callbacks from options - animation - .progress( animation.opts.progress ) - .done( animation.opts.done, animation.opts.complete ) - .fail( animation.opts.fail ) - .always( animation.opts.always ); - - jQuery.fx.timer( - jQuery.extend( tick, { - elem: elem, - anim: animation, - queue: animation.opts.queue - } ) - ); - - return animation; -} - -jQuery.Animation = jQuery.extend( Animation, { - - tweeners: { - "*": [ function( prop, value ) { - var tween = this.createTween( prop, value ); - adjustCSS( tween.elem, prop, rcssNum.exec( value ), tween ); - return tween; - } ] - }, - - tweener: function( props, callback ) { - if ( isFunction( props ) ) { - callback = props; - props = [ "*" ]; - } else { - props = props.match( rnothtmlwhite ); - } - - var prop, - index = 0, - length = props.length; - - for ( ; index < length; index++ ) { - prop = props[ index ]; - Animation.tweeners[ prop ] = Animation.tweeners[ prop ] || []; - Animation.tweeners[ prop ].unshift( callback ); - } - }, - 
- prefilters: [ defaultPrefilter ], - - prefilter: function( callback, prepend ) { - if ( prepend ) { - Animation.prefilters.unshift( callback ); - } else { - Animation.prefilters.push( callback ); - } - } -} ); - -jQuery.speed = function( speed, easing, fn ) { - var opt = speed && typeof speed === "object" ? jQuery.extend( {}, speed ) : { - complete: fn || !fn && easing || - isFunction( speed ) && speed, - duration: speed, - easing: fn && easing || easing && !isFunction( easing ) && easing - }; - - // Go to the end state if fx are off - if ( jQuery.fx.off ) { - opt.duration = 0; - - } else { - if ( typeof opt.duration !== "number" ) { - if ( opt.duration in jQuery.fx.speeds ) { - opt.duration = jQuery.fx.speeds[ opt.duration ]; - - } else { - opt.duration = jQuery.fx.speeds._default; - } - } - } - - // Normalize opt.queue - true/undefined/null -> "fx" - if ( opt.queue == null || opt.queue === true ) { - opt.queue = "fx"; - } - - // Queueing - opt.old = opt.complete; - - opt.complete = function() { - if ( isFunction( opt.old ) ) { - opt.old.call( this ); - } - - if ( opt.queue ) { - jQuery.dequeue( this, opt.queue ); - } - }; - - return opt; -}; - -jQuery.fn.extend( { - fadeTo: function( speed, to, easing, callback ) { - - // Show any hidden elements after setting opacity to 0 - return this.filter( isHiddenWithinTree ).css( "opacity", 0 ).show() - - // Animate to the value specified - .end().animate( { opacity: to }, speed, easing, callback ); - }, - animate: function( prop, speed, easing, callback ) { - var empty = jQuery.isEmptyObject( prop ), - optall = jQuery.speed( speed, easing, callback ), - doAnimation = function() { - - // Operate on a copy of prop so per-property easing won't be lost - var anim = Animation( this, jQuery.extend( {}, prop ), optall ); - - // Empty animations, or finishing resolves immediately - if ( empty || dataPriv.get( this, "finish" ) ) { - anim.stop( true ); - } - }; - doAnimation.finish = doAnimation; - - return empty || optall.queue 
=== false ? - this.each( doAnimation ) : - this.queue( optall.queue, doAnimation ); - }, - stop: function( type, clearQueue, gotoEnd ) { - var stopQueue = function( hooks ) { - var stop = hooks.stop; - delete hooks.stop; - stop( gotoEnd ); - }; - - if ( typeof type !== "string" ) { - gotoEnd = clearQueue; - clearQueue = type; - type = undefined; - } - if ( clearQueue ) { - this.queue( type || "fx", [] ); - } - - return this.each( function() { - var dequeue = true, - index = type != null && type + "queueHooks", - timers = jQuery.timers, - data = dataPriv.get( this ); - - if ( index ) { - if ( data[ index ] && data[ index ].stop ) { - stopQueue( data[ index ] ); - } - } else { - for ( index in data ) { - if ( data[ index ] && data[ index ].stop && rrun.test( index ) ) { - stopQueue( data[ index ] ); - } - } - } - - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && - ( type == null || timers[ index ].queue === type ) ) { - - timers[ index ].anim.stop( gotoEnd ); - dequeue = false; - timers.splice( index, 1 ); - } - } - - // Start the next in the queue if the last step wasn't forced. - // Timers currently will call their complete callbacks, which - // will dequeue but only if they were gotoEnd. - if ( dequeue || !gotoEnd ) { - jQuery.dequeue( this, type ); - } - } ); - }, - finish: function( type ) { - if ( type !== false ) { - type = type || "fx"; - } - return this.each( function() { - var index, - data = dataPriv.get( this ), - queue = data[ type + "queue" ], - hooks = data[ type + "queueHooks" ], - timers = jQuery.timers, - length = queue ? 
queue.length : 0; - - // Enable finishing flag on private data - data.finish = true; - - // Empty the queue first - jQuery.queue( this, type, [] ); - - if ( hooks && hooks.stop ) { - hooks.stop.call( this, true ); - } - - // Look for any active animations, and finish them - for ( index = timers.length; index--; ) { - if ( timers[ index ].elem === this && timers[ index ].queue === type ) { - timers[ index ].anim.stop( true ); - timers.splice( index, 1 ); - } - } - - // Look for any animations in the old queue and finish them - for ( index = 0; index < length; index++ ) { - if ( queue[ index ] && queue[ index ].finish ) { - queue[ index ].finish.call( this ); - } - } - - // Turn off finishing flag - delete data.finish; - } ); - } -} ); - -jQuery.each( [ "toggle", "show", "hide" ], function( _i, name ) { - var cssFn = jQuery.fn[ name ]; - jQuery.fn[ name ] = function( speed, easing, callback ) { - return speed == null || typeof speed === "boolean" ? - cssFn.apply( this, arguments ) : - this.animate( genFx( name, true ), speed, easing, callback ); - }; -} ); - -// Generate shortcuts for custom animations -jQuery.each( { - slideDown: genFx( "show" ), - slideUp: genFx( "hide" ), - slideToggle: genFx( "toggle" ), - fadeIn: { opacity: "show" }, - fadeOut: { opacity: "hide" }, - fadeToggle: { opacity: "toggle" } -}, function( name, props ) { - jQuery.fn[ name ] = function( speed, easing, callback ) { - return this.animate( props, speed, easing, callback ); - }; -} ); - -jQuery.timers = []; -jQuery.fx.tick = function() { - var timer, - i = 0, - timers = jQuery.timers; - - fxNow = Date.now(); - - for ( ; i < timers.length; i++ ) { - timer = timers[ i ]; - - // Run the timer and safely remove it when done (allowing for external removal) - if ( !timer() && timers[ i ] === timer ) { - timers.splice( i--, 1 ); - } - } - - if ( !timers.length ) { - jQuery.fx.stop(); - } - fxNow = undefined; -}; - -jQuery.fx.timer = function( timer ) { - jQuery.timers.push( timer ); - 
jQuery.fx.start(); -}; - -jQuery.fx.interval = 13; -jQuery.fx.start = function() { - if ( inProgress ) { - return; - } - - inProgress = true; - schedule(); -}; - -jQuery.fx.stop = function() { - inProgress = null; -}; - -jQuery.fx.speeds = { - slow: 600, - fast: 200, - - // Default speed - _default: 400 -}; - - -// Based off of the plugin by Clint Helfers, with permission. -// https://web.archive.org/web/20100324014747/http://blindsignals.com/index.php/2009/07/jquery-delay/ -jQuery.fn.delay = function( time, type ) { - time = jQuery.fx ? jQuery.fx.speeds[ time ] || time : time; - type = type || "fx"; - - return this.queue( type, function( next, hooks ) { - var timeout = window.setTimeout( next, time ); - hooks.stop = function() { - window.clearTimeout( timeout ); - }; - } ); -}; - - -( function() { - var input = document.createElement( "input" ), - select = document.createElement( "select" ), - opt = select.appendChild( document.createElement( "option" ) ); - - input.type = "checkbox"; - - // Support: Android <=4.3 only - // Default value for a checkbox should be "on" - support.checkOn = input.value !== ""; - - // Support: IE <=11 only - // Must access selectedIndex to make default options select - support.optSelected = opt.selected; - - // Support: IE <=11 only - // An input loses its value after becoming a radio - input = document.createElement( "input" ); - input.value = "t"; - input.type = "radio"; - support.radioValue = input.value === "t"; -} )(); - - -var boolHook, - attrHandle = jQuery.expr.attrHandle; - -jQuery.fn.extend( { - attr: function( name, value ) { - return access( this, jQuery.attr, name, value, arguments.length > 1 ); - }, - - removeAttr: function( name ) { - return this.each( function() { - jQuery.removeAttr( this, name ); - } ); - } -} ); - -jQuery.extend( { - attr: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set attributes on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || 
nType === 2 ) { - return; - } - - // Fallback to prop when attributes are not supported - if ( typeof elem.getAttribute === "undefined" ) { - return jQuery.prop( elem, name, value ); - } - - // Attribute hooks are determined by the lowercase version - // Grab necessary hook if one is defined - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - hooks = jQuery.attrHooks[ name.toLowerCase() ] || - ( jQuery.expr.match.bool.test( name ) ? boolHook : undefined ); - } - - if ( value !== undefined ) { - if ( value === null ) { - jQuery.removeAttr( elem, name ); - return; - } - - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - elem.setAttribute( name, value + "" ); - return value; - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - ret = jQuery.find.attr( elem, name ); - - // Non-existent attributes return null, we normalize to undefined - return ret == null ? undefined : ret; - }, - - attrHooks: { - type: { - set: function( elem, value ) { - if ( !support.radioValue && value === "radio" && - nodeName( elem, "input" ) ) { - var val = elem.value; - elem.setAttribute( "type", value ); - if ( val ) { - elem.value = val; - } - return value; - } - } - } - }, - - removeAttr: function( elem, value ) { - var name, - i = 0, - - // Attribute names can contain non-HTML whitespace characters - // https://html.spec.whatwg.org/multipage/syntax.html#attributes-2 - attrNames = value && value.match( rnothtmlwhite ); - - if ( attrNames && elem.nodeType === 1 ) { - while ( ( name = attrNames[ i++ ] ) ) { - elem.removeAttribute( name ); - } - } - } -} ); - -// Hooks for boolean attributes -boolHook = { - set: function( elem, value, name ) { - if ( value === false ) { - - // Remove boolean attributes when set to false - jQuery.removeAttr( elem, name ); - } else { - elem.setAttribute( name, name ); - } - return name; - } -}; - -jQuery.each( 
jQuery.expr.match.bool.source.match( /\w+/g ), function( _i, name ) { - var getter = attrHandle[ name ] || jQuery.find.attr; - - attrHandle[ name ] = function( elem, name, isXML ) { - var ret, handle, - lowercaseName = name.toLowerCase(); - - if ( !isXML ) { - - // Avoid an infinite loop by temporarily removing this function from the getter - handle = attrHandle[ lowercaseName ]; - attrHandle[ lowercaseName ] = ret; - ret = getter( elem, name, isXML ) != null ? - lowercaseName : - null; - attrHandle[ lowercaseName ] = handle; - } - return ret; - }; -} ); - - - - -var rfocusable = /^(?:input|select|textarea|button)$/i, - rclickable = /^(?:a|area)$/i; - -jQuery.fn.extend( { - prop: function( name, value ) { - return access( this, jQuery.prop, name, value, arguments.length > 1 ); - }, - - removeProp: function( name ) { - return this.each( function() { - delete this[ jQuery.propFix[ name ] || name ]; - } ); - } -} ); - -jQuery.extend( { - prop: function( elem, name, value ) { - var ret, hooks, - nType = elem.nodeType; - - // Don't get/set properties on text, comment and attribute nodes - if ( nType === 3 || nType === 8 || nType === 2 ) { - return; - } - - if ( nType !== 1 || !jQuery.isXMLDoc( elem ) ) { - - // Fix name and attach hooks - name = jQuery.propFix[ name ] || name; - hooks = jQuery.propHooks[ name ]; - } - - if ( value !== undefined ) { - if ( hooks && "set" in hooks && - ( ret = hooks.set( elem, value, name ) ) !== undefined ) { - return ret; - } - - return ( elem[ name ] = value ); - } - - if ( hooks && "get" in hooks && ( ret = hooks.get( elem, name ) ) !== null ) { - return ret; - } - - return elem[ name ]; - }, - - propHooks: { - tabIndex: { - get: function( elem ) { - - // Support: IE <=9 - 11 only - // elem.tabIndex doesn't always return the - // correct value when it hasn't been explicitly set - // https://web.archive.org/web/20141116233347/http://fluidproject.org/blog/2008/01/09/getting-setting-and-removing-tabindex-values-with-javascript/ - // Use 
proper attribute retrieval(#12072) - var tabindex = jQuery.find.attr( elem, "tabindex" ); - - if ( tabindex ) { - return parseInt( tabindex, 10 ); - } - - if ( - rfocusable.test( elem.nodeName ) || - rclickable.test( elem.nodeName ) && - elem.href - ) { - return 0; - } - - return -1; - } - } - }, - - propFix: { - "for": "htmlFor", - "class": "className" - } -} ); - -// Support: IE <=11 only -// Accessing the selectedIndex property -// forces the browser to respect setting selected -// on the option -// The getter ensures a default option is selected -// when in an optgroup -// eslint rule "no-unused-expressions" is disabled for this code -// since it considers such accessions noop -if ( !support.optSelected ) { - jQuery.propHooks.selected = { - get: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent && parent.parentNode ) { - parent.parentNode.selectedIndex; - } - return null; - }, - set: function( elem ) { - - /* eslint no-unused-expressions: "off" */ - - var parent = elem.parentNode; - if ( parent ) { - parent.selectedIndex; - - if ( parent.parentNode ) { - parent.parentNode.selectedIndex; - } - } - } - }; -} - -jQuery.each( [ - "tabIndex", - "readOnly", - "maxLength", - "cellSpacing", - "cellPadding", - "rowSpan", - "colSpan", - "useMap", - "frameBorder", - "contentEditable" -], function() { - jQuery.propFix[ this.toLowerCase() ] = this; -} ); - - - - - // Strip and collapse whitespace according to HTML spec - // https://infra.spec.whatwg.org/#strip-and-collapse-ascii-whitespace - function stripAndCollapse( value ) { - var tokens = value.match( rnothtmlwhite ) || []; - return tokens.join( " " ); - } - - -function getClass( elem ) { - return elem.getAttribute && elem.getAttribute( "class" ) || ""; -} - -function classesToArray( value ) { - if ( Array.isArray( value ) ) { - return value; - } - if ( typeof value === "string" ) { - return value.match( rnothtmlwhite ) || []; - } - return []; -} - 
-jQuery.fn.extend( { - addClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).addClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - classes = classesToArray( value ); - - if ( classes.length ) { - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - if ( cur.indexOf( " " + clazz + " " ) < 0 ) { - cur += clazz + " "; - } - } - - // Only assign if different to avoid unneeded rendering. - finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - removeClass: function( value ) { - var classes, elem, cur, curValue, clazz, j, finalValue, - i = 0; - - if ( isFunction( value ) ) { - return this.each( function( j ) { - jQuery( this ).removeClass( value.call( this, j, getClass( this ) ) ); - } ); - } - - if ( !arguments.length ) { - return this.attr( "class", "" ); - } - - classes = classesToArray( value ); - - if ( classes.length ) { - while ( ( elem = this[ i++ ] ) ) { - curValue = getClass( elem ); - - // This expression is here for better compressibility (see addClass) - cur = elem.nodeType === 1 && ( " " + stripAndCollapse( curValue ) + " " ); - - if ( cur ) { - j = 0; - while ( ( clazz = classes[ j++ ] ) ) { - - // Remove *all* instances - while ( cur.indexOf( " " + clazz + " " ) > -1 ) { - cur = cur.replace( " " + clazz + " ", " " ); - } - } - - // Only assign if different to avoid unneeded rendering. 
- finalValue = stripAndCollapse( cur ); - if ( curValue !== finalValue ) { - elem.setAttribute( "class", finalValue ); - } - } - } - } - - return this; - }, - - toggleClass: function( value, stateVal ) { - var type = typeof value, - isValidValue = type === "string" || Array.isArray( value ); - - if ( typeof stateVal === "boolean" && isValidValue ) { - return stateVal ? this.addClass( value ) : this.removeClass( value ); - } - - if ( isFunction( value ) ) { - return this.each( function( i ) { - jQuery( this ).toggleClass( - value.call( this, i, getClass( this ), stateVal ), - stateVal - ); - } ); - } - - return this.each( function() { - var className, i, self, classNames; - - if ( isValidValue ) { - - // Toggle individual class names - i = 0; - self = jQuery( this ); - classNames = classesToArray( value ); - - while ( ( className = classNames[ i++ ] ) ) { - - // Check each className given, space separated list - if ( self.hasClass( className ) ) { - self.removeClass( className ); - } else { - self.addClass( className ); - } - } - - // Toggle whole class name - } else if ( value === undefined || type === "boolean" ) { - className = getClass( this ); - if ( className ) { - - // Store className if set - dataPriv.set( this, "__className__", className ); - } - - // If the element has a class name or if we're passed `false`, - // then remove the whole classname (if there was one, the above saved it). - // Otherwise bring back whatever was previously saved (if anything), - // falling back to the empty string if nothing was stored. - if ( this.setAttribute ) { - this.setAttribute( "class", - className || value === false ? 
- "" : - dataPriv.get( this, "__className__" ) || "" - ); - } - } - } ); - }, - - hasClass: function( selector ) { - var className, elem, - i = 0; - - className = " " + selector + " "; - while ( ( elem = this[ i++ ] ) ) { - if ( elem.nodeType === 1 && - ( " " + stripAndCollapse( getClass( elem ) ) + " " ).indexOf( className ) > -1 ) { - return true; - } - } - - return false; - } -} ); - - - - -var rreturn = /\r/g; - -jQuery.fn.extend( { - val: function( value ) { - var hooks, ret, valueIsFunction, - elem = this[ 0 ]; - - if ( !arguments.length ) { - if ( elem ) { - hooks = jQuery.valHooks[ elem.type ] || - jQuery.valHooks[ elem.nodeName.toLowerCase() ]; - - if ( hooks && - "get" in hooks && - ( ret = hooks.get( elem, "value" ) ) !== undefined - ) { - return ret; - } - - ret = elem.value; - - // Handle most common string cases - if ( typeof ret === "string" ) { - return ret.replace( rreturn, "" ); - } - - // Handle cases where value is null/undef or number - return ret == null ? "" : ret; - } - - return; - } - - valueIsFunction = isFunction( value ); - - return this.each( function( i ) { - var val; - - if ( this.nodeType !== 1 ) { - return; - } - - if ( valueIsFunction ) { - val = value.call( this, i, jQuery( this ).val() ); - } else { - val = value; - } - - // Treat null/undefined as ""; convert numbers to string - if ( val == null ) { - val = ""; - - } else if ( typeof val === "number" ) { - val += ""; - - } else if ( Array.isArray( val ) ) { - val = jQuery.map( val, function( value ) { - return value == null ? "" : value + ""; - } ); - } - - hooks = jQuery.valHooks[ this.type ] || jQuery.valHooks[ this.nodeName.toLowerCase() ]; - - // If set returns undefined, fall back to normal setting - if ( !hooks || !( "set" in hooks ) || hooks.set( this, val, "value" ) === undefined ) { - this.value = val; - } - } ); - } -} ); - -jQuery.extend( { - valHooks: { - option: { - get: function( elem ) { - - var val = jQuery.find.attr( elem, "value" ); - return val != null ? 
- val : - - // Support: IE <=10 - 11 only - // option.text throws exceptions (#14686, #14858) - // Strip and collapse whitespace - // https://html.spec.whatwg.org/#strip-and-collapse-whitespace - stripAndCollapse( jQuery.text( elem ) ); - } - }, - select: { - get: function( elem ) { - var value, option, i, - options = elem.options, - index = elem.selectedIndex, - one = elem.type === "select-one", - values = one ? null : [], - max = one ? index + 1 : options.length; - - if ( index < 0 ) { - i = max; - - } else { - i = one ? index : 0; - } - - // Loop through all the selected options - for ( ; i < max; i++ ) { - option = options[ i ]; - - // Support: IE <=9 only - // IE8-9 doesn't update selected after form reset (#2551) - if ( ( option.selected || i === index ) && - - // Don't return options that are disabled or in a disabled optgroup - !option.disabled && - ( !option.parentNode.disabled || - !nodeName( option.parentNode, "optgroup" ) ) ) { - - // Get the specific value for the option - value = jQuery( option ).val(); - - // We don't need an array for one selects - if ( one ) { - return value; - } - - // Multi-Selects return an array - values.push( value ); - } - } - - return values; - }, - - set: function( elem, value ) { - var optionSet, option, - options = elem.options, - values = jQuery.makeArray( value ), - i = options.length; - - while ( i-- ) { - option = options[ i ]; - - /* eslint-disable no-cond-assign */ - - if ( option.selected = - jQuery.inArray( jQuery.valHooks.option.get( option ), values ) > -1 - ) { - optionSet = true; - } - - /* eslint-enable no-cond-assign */ - } - - // Force browsers to behave consistently when non-matching value is set - if ( !optionSet ) { - elem.selectedIndex = -1; - } - return values; - } - } - } -} ); - -// Radios and checkboxes getter/setter -jQuery.each( [ "radio", "checkbox" ], function() { - jQuery.valHooks[ this ] = { - set: function( elem, value ) { - if ( Array.isArray( value ) ) { - return ( elem.checked = 
jQuery.inArray( jQuery( elem ).val(), value ) > -1 ); - } - } - }; - if ( !support.checkOn ) { - jQuery.valHooks[ this ].get = function( elem ) { - return elem.getAttribute( "value" ) === null ? "on" : elem.value; - }; - } -} ); - - - - -// Return jQuery for attributes-only inclusion - - -support.focusin = "onfocusin" in window; - - -var rfocusMorph = /^(?:focusinfocus|focusoutblur)$/, - stopPropagationCallback = function( e ) { - e.stopPropagation(); - }; - -jQuery.extend( jQuery.event, { - - trigger: function( event, data, elem, onlyHandlers ) { - - var i, cur, tmp, bubbleType, ontype, handle, special, lastElement, - eventPath = [ elem || document ], - type = hasOwn.call( event, "type" ) ? event.type : event, - namespaces = hasOwn.call( event, "namespace" ) ? event.namespace.split( "." ) : []; - - cur = lastElement = tmp = elem = elem || document; - - // Don't do events on text and comment nodes - if ( elem.nodeType === 3 || elem.nodeType === 8 ) { - return; - } - - // focus/blur morphs to focusin/out; ensure we're not firing them right now - if ( rfocusMorph.test( type + jQuery.event.triggered ) ) { - return; - } - - if ( type.indexOf( "." ) > -1 ) { - - // Namespaced trigger; create a regexp to match event type in handle() - namespaces = type.split( "." ); - type = namespaces.shift(); - namespaces.sort(); - } - ontype = type.indexOf( ":" ) < 0 && "on" + type; - - // Caller can pass in a jQuery.Event object, Object, or just an event type string - event = event[ jQuery.expando ] ? - event : - new jQuery.Event( type, typeof event === "object" && event ); - - // Trigger bitmask: & 1 for native handlers; & 2 for jQuery (always true) - event.isTrigger = onlyHandlers ? 2 : 3; - event.namespace = namespaces.join( "." ); - event.rnamespace = event.namespace ? 
- new RegExp( "(^|\\.)" + namespaces.join( "\\.(?:.*\\.|)" ) + "(\\.|$)" ) : - null; - - // Clean up the event in case it is being reused - event.result = undefined; - if ( !event.target ) { - event.target = elem; - } - - // Clone any incoming data and prepend the event, creating the handler arg list - data = data == null ? - [ event ] : - jQuery.makeArray( data, [ event ] ); - - // Allow special events to draw outside the lines - special = jQuery.event.special[ type ] || {}; - if ( !onlyHandlers && special.trigger && special.trigger.apply( elem, data ) === false ) { - return; - } - - // Determine event propagation path in advance, per W3C events spec (#9951) - // Bubble up to document, then to window; watch for a global ownerDocument var (#9724) - if ( !onlyHandlers && !special.noBubble && !isWindow( elem ) ) { - - bubbleType = special.delegateType || type; - if ( !rfocusMorph.test( bubbleType + type ) ) { - cur = cur.parentNode; - } - for ( ; cur; cur = cur.parentNode ) { - eventPath.push( cur ); - tmp = cur; - } - - // Only add window if we got to document (e.g., not plain obj or detached DOM) - if ( tmp === ( elem.ownerDocument || document ) ) { - eventPath.push( tmp.defaultView || tmp.parentWindow || window ); - } - } - - // Fire handlers on the event path - i = 0; - while ( ( cur = eventPath[ i++ ] ) && !event.isPropagationStopped() ) { - lastElement = cur; - event.type = i > 1 ? 
- bubbleType : - special.bindType || type; - - // jQuery handler - handle = ( - dataPriv.get( cur, "events" ) || Object.create( null ) - )[ event.type ] && - dataPriv.get( cur, "handle" ); - if ( handle ) { - handle.apply( cur, data ); - } - - // Native handler - handle = ontype && cur[ ontype ]; - if ( handle && handle.apply && acceptData( cur ) ) { - event.result = handle.apply( cur, data ); - if ( event.result === false ) { - event.preventDefault(); - } - } - } - event.type = type; - - // If nobody prevented the default action, do it now - if ( !onlyHandlers && !event.isDefaultPrevented() ) { - - if ( ( !special._default || - special._default.apply( eventPath.pop(), data ) === false ) && - acceptData( elem ) ) { - - // Call a native DOM method on the target with the same name as the event. - // Don't do default actions on window, that's where global variables be (#6170) - if ( ontype && isFunction( elem[ type ] ) && !isWindow( elem ) ) { - - // Don't re-trigger an onFOO event when we call its FOO() method - tmp = elem[ ontype ]; - - if ( tmp ) { - elem[ ontype ] = null; - } - - // Prevent re-triggering of the same event, since we already bubbled it above - jQuery.event.triggered = type; - - if ( event.isPropagationStopped() ) { - lastElement.addEventListener( type, stopPropagationCallback ); - } - - elem[ type ](); - - if ( event.isPropagationStopped() ) { - lastElement.removeEventListener( type, stopPropagationCallback ); - } - - jQuery.event.triggered = undefined; - - if ( tmp ) { - elem[ ontype ] = tmp; - } - } - } - } - - return event.result; - }, - - // Piggyback on a donor event to simulate a different one - // Used only for `focus(in | out)` events - simulate: function( type, elem, event ) { - var e = jQuery.extend( - new jQuery.Event(), - event, - { - type: type, - isSimulated: true - } - ); - - jQuery.event.trigger( e, null, elem ); - } - -} ); - -jQuery.fn.extend( { - - trigger: function( type, data ) { - return this.each( function() { - 
jQuery.event.trigger( type, data, this ); - } ); - }, - triggerHandler: function( type, data ) { - var elem = this[ 0 ]; - if ( elem ) { - return jQuery.event.trigger( type, data, elem, true ); - } - } -} ); - - -// Support: Firefox <=44 -// Firefox doesn't have focus(in | out) events -// Related ticket - https://bugzilla.mozilla.org/show_bug.cgi?id=687787 -// -// Support: Chrome <=48 - 49, Safari <=9.0 - 9.1 -// focus(in | out) events fire after focus & blur events, -// which is spec violation - http://www.w3.org/TR/DOM-Level-3-Events/#events-focusevent-event-order -// Related ticket - https://bugs.chromium.org/p/chromium/issues/detail?id=449857 -if ( !support.focusin ) { - jQuery.each( { focus: "focusin", blur: "focusout" }, function( orig, fix ) { - - // Attach a single capturing handler on the document while someone wants focusin/focusout - var handler = function( event ) { - jQuery.event.simulate( fix, event.target, jQuery.event.fix( event ) ); - }; - - jQuery.event.special[ fix ] = { - setup: function() { - - // Handle: regular nodes (via `this.ownerDocument`), window - // (via `this.document`) & document (via `this`). 
- var doc = this.ownerDocument || this.document || this, - attaches = dataPriv.access( doc, fix ); - - if ( !attaches ) { - doc.addEventListener( orig, handler, true ); - } - dataPriv.access( doc, fix, ( attaches || 0 ) + 1 ); - }, - teardown: function() { - var doc = this.ownerDocument || this.document || this, - attaches = dataPriv.access( doc, fix ) - 1; - - if ( !attaches ) { - doc.removeEventListener( orig, handler, true ); - dataPriv.remove( doc, fix ); - - } else { - dataPriv.access( doc, fix, attaches ); - } - } - }; - } ); -} -var location = window.location; - -var nonce = { guid: Date.now() }; - -var rquery = ( /\?/ ); - - - -// Cross-browser xml parsing -jQuery.parseXML = function( data ) { - var xml; - if ( !data || typeof data !== "string" ) { - return null; - } - - // Support: IE 9 - 11 only - // IE throws on parseFromString with invalid input. - try { - xml = ( new window.DOMParser() ).parseFromString( data, "text/xml" ); - } catch ( e ) { - xml = undefined; - } - - if ( !xml || xml.getElementsByTagName( "parsererror" ).length ) { - jQuery.error( "Invalid XML: " + data ); - } - return xml; -}; - - -var - rbracket = /\[\]$/, - rCRLF = /\r?\n/g, - rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, - rsubmittable = /^(?:input|select|textarea|keygen)/i; - -function buildParams( prefix, obj, traditional, add ) { - var name; - - if ( Array.isArray( obj ) ) { - - // Serialize array item. - jQuery.each( obj, function( i, v ) { - if ( traditional || rbracket.test( prefix ) ) { - - // Treat each array item as a scalar. - add( prefix, v ); - - } else { - - // Item is non-scalar (array or object), encode its numeric index. - buildParams( - prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", - v, - traditional, - add - ); - } - } ); - - } else if ( !traditional && toType( obj ) === "object" ) { - - // Serialize object item. 
- for ( name in obj ) { - buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); - } - - } else { - - // Serialize scalar item. - add( prefix, obj ); - } -} - -// Serialize an array of form elements or a set of -// key/values into a query string -jQuery.param = function( a, traditional ) { - var prefix, - s = [], - add = function( key, valueOrFunction ) { - - // If value is a function, invoke it and use its return value - var value = isFunction( valueOrFunction ) ? - valueOrFunction() : - valueOrFunction; - - s[ s.length ] = encodeURIComponent( key ) + "=" + - encodeURIComponent( value == null ? "" : value ); - }; - - if ( a == null ) { - return ""; - } - - // If an array was passed in, assume that it is an array of form elements. - if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { - - // Serialize the form elements - jQuery.each( a, function() { - add( this.name, this.value ); - } ); - - } else { - - // If traditional, encode the "old" way (the way 1.3.2 or older - // did it), otherwise encode params recursively. - for ( prefix in a ) { - buildParams( prefix, a[ prefix ], traditional, add ); - } - } - - // Return the resulting serialization - return s.join( "&" ); -}; - -jQuery.fn.extend( { - serialize: function() { - return jQuery.param( this.serializeArray() ); - }, - serializeArray: function() { - return this.map( function() { - - // Can add propHook for "elements" to filter or add form elements - var elements = jQuery.prop( this, "elements" ); - return elements ? 
jQuery.makeArray( elements ) : this; - } ) - .filter( function() { - var type = this.type; - - // Use .is( ":disabled" ) so that fieldset[disabled] works - return this.name && !jQuery( this ).is( ":disabled" ) && - rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && - ( this.checked || !rcheckableType.test( type ) ); - } ) - .map( function( _i, elem ) { - var val = jQuery( this ).val(); - - if ( val == null ) { - return null; - } - - if ( Array.isArray( val ) ) { - return jQuery.map( val, function( val ) { - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ); - } - - return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; - } ).get(); - } -} ); - - -var - r20 = /%20/g, - rhash = /#.*$/, - rantiCache = /([?&])_=[^&]*/, - rheaders = /^(.*?):[ \t]*([^\r\n]*)$/mg, - - // #7653, #8125, #8152: local protocol detection - rlocalProtocol = /^(?:about|app|app-storage|.+-extension|file|res|widget):$/, - rnoContent = /^(?:GET|HEAD)$/, - rprotocol = /^\/\//, - - /* Prefilters - * 1) They are useful to introduce custom dataTypes (see ajax/jsonp.js for an example) - * 2) These are called: - * - BEFORE asking for a transport - * - AFTER param serialization (s.data is a string if s.processData is true) - * 3) key is the dataType - * 4) the catchall symbol "*" can be used - * 5) execution will start with transport dataType and THEN continue down to "*" if needed - */ - prefilters = {}, - - /* Transports bindings - * 1) key is the dataType - * 2) the catchall symbol "*" can be used - * 3) selection will start with transport dataType and THEN go to "*" if needed - */ - transports = {}, - - // Avoid comment-prolog char sequence (#10098); must appease lint and evade compression - allTypes = "*/".concat( "*" ), - - // Anchor tag for parsing the document origin - originAnchor = document.createElement( "a" ); - originAnchor.href = location.href; - -// Base "constructor" for jQuery.ajaxPrefilter and jQuery.ajaxTransport -function 
addToPrefiltersOrTransports( structure ) { - - // dataTypeExpression is optional and defaults to "*" - return function( dataTypeExpression, func ) { - - if ( typeof dataTypeExpression !== "string" ) { - func = dataTypeExpression; - dataTypeExpression = "*"; - } - - var dataType, - i = 0, - dataTypes = dataTypeExpression.toLowerCase().match( rnothtmlwhite ) || []; - - if ( isFunction( func ) ) { - - // For each dataType in the dataTypeExpression - while ( ( dataType = dataTypes[ i++ ] ) ) { - - // Prepend if requested - if ( dataType[ 0 ] === "+" ) { - dataType = dataType.slice( 1 ) || "*"; - ( structure[ dataType ] = structure[ dataType ] || [] ).unshift( func ); - - // Otherwise append - } else { - ( structure[ dataType ] = structure[ dataType ] || [] ).push( func ); - } - } - } - }; -} - -// Base inspection function for prefilters and transports -function inspectPrefiltersOrTransports( structure, options, originalOptions, jqXHR ) { - - var inspected = {}, - seekingTransport = ( structure === transports ); - - function inspect( dataType ) { - var selected; - inspected[ dataType ] = true; - jQuery.each( structure[ dataType ] || [], function( _, prefilterOrFactory ) { - var dataTypeOrTransport = prefilterOrFactory( options, originalOptions, jqXHR ); - if ( typeof dataTypeOrTransport === "string" && - !seekingTransport && !inspected[ dataTypeOrTransport ] ) { - - options.dataTypes.unshift( dataTypeOrTransport ); - inspect( dataTypeOrTransport ); - return false; - } else if ( seekingTransport ) { - return !( selected = dataTypeOrTransport ); - } - } ); - return selected; - } - - return inspect( options.dataTypes[ 0 ] ) || !inspected[ "*" ] && inspect( "*" ); -} - -// A special extend for ajax options -// that takes "flat" options (not to be deep extended) -// Fixes #9887 -function ajaxExtend( target, src ) { - var key, deep, - flatOptions = jQuery.ajaxSettings.flatOptions || {}; - - for ( key in src ) { - if ( src[ key ] !== undefined ) { - ( flatOptions[ key ] ? 
target : ( deep || ( deep = {} ) ) )[ key ] = src[ key ]; - } - } - if ( deep ) { - jQuery.extend( true, target, deep ); - } - - return target; -} - -/* Handles responses to an ajax request: - * - finds the right dataType (mediates between content-type and expected dataType) - * - returns the corresponding response - */ -function ajaxHandleResponses( s, jqXHR, responses ) { - - var ct, type, finalDataType, firstDataType, - contents = s.contents, - dataTypes = s.dataTypes; - - // Remove auto dataType and get content-type in the process - while ( dataTypes[ 0 ] === "*" ) { - dataTypes.shift(); - if ( ct === undefined ) { - ct = s.mimeType || jqXHR.getResponseHeader( "Content-Type" ); - } - } - - // Check if we're dealing with a known content-type - if ( ct ) { - for ( type in contents ) { - if ( contents[ type ] && contents[ type ].test( ct ) ) { - dataTypes.unshift( type ); - break; - } - } - } - - // Check to see if we have a response for the expected dataType - if ( dataTypes[ 0 ] in responses ) { - finalDataType = dataTypes[ 0 ]; - } else { - - // Try convertible dataTypes - for ( type in responses ) { - if ( !dataTypes[ 0 ] || s.converters[ type + " " + dataTypes[ 0 ] ] ) { - finalDataType = type; - break; - } - if ( !firstDataType ) { - firstDataType = type; - } - } - - // Or just use first one - finalDataType = finalDataType || firstDataType; - } - - // If we found a dataType - // We add the dataType to the list if needed - // and return the corresponding response - if ( finalDataType ) { - if ( finalDataType !== dataTypes[ 0 ] ) { - dataTypes.unshift( finalDataType ); - } - return responses[ finalDataType ]; - } -} - -/* Chain conversions given the request and the original response - * Also sets the responseXXX fields on the jqXHR instance - */ -function ajaxConvert( s, response, jqXHR, isSuccess ) { - var conv2, current, conv, tmp, prev, - converters = {}, - - // Work with a copy of dataTypes in case we need to modify it for conversion - dataTypes = 
s.dataTypes.slice(); - - // Create converters map with lowercased keys - if ( dataTypes[ 1 ] ) { - for ( conv in s.converters ) { - converters[ conv.toLowerCase() ] = s.converters[ conv ]; - } - } - - current = dataTypes.shift(); - - // Convert to each sequential dataType - while ( current ) { - - if ( s.responseFields[ current ] ) { - jqXHR[ s.responseFields[ current ] ] = response; - } - - // Apply the dataFilter if provided - if ( !prev && isSuccess && s.dataFilter ) { - response = s.dataFilter( response, s.dataType ); - } - - prev = current; - current = dataTypes.shift(); - - if ( current ) { - - // There's only work to do if current dataType is non-auto - if ( current === "*" ) { - - current = prev; - - // Convert response if prev dataType is non-auto and differs from current - } else if ( prev !== "*" && prev !== current ) { - - // Seek a direct converter - conv = converters[ prev + " " + current ] || converters[ "* " + current ]; - - // If none found, seek a pair - if ( !conv ) { - for ( conv2 in converters ) { - - // If conv2 outputs current - tmp = conv2.split( " " ); - if ( tmp[ 1 ] === current ) { - - // If prev can be converted to accepted input - conv = converters[ prev + " " + tmp[ 0 ] ] || - converters[ "* " + tmp[ 0 ] ]; - if ( conv ) { - - // Condense equivalence converters - if ( conv === true ) { - conv = converters[ conv2 ]; - - // Otherwise, insert the intermediate dataType - } else if ( converters[ conv2 ] !== true ) { - current = tmp[ 0 ]; - dataTypes.unshift( tmp[ 1 ] ); - } - break; - } - } - } - } - - // Apply converter (if not an equivalence) - if ( conv !== true ) { - - // Unless errors are allowed to bubble, catch and return them - if ( conv && s.throws ) { - response = conv( response ); - } else { - try { - response = conv( response ); - } catch ( e ) { - return { - state: "parsererror", - error: conv ? 
e : "No conversion from " + prev + " to " + current - }; - } - } - } - } - } - } - - return { state: "success", data: response }; -} - -jQuery.extend( { - - // Counter for holding the number of active queries - active: 0, - - // Last-Modified header cache for next request - lastModified: {}, - etag: {}, - - ajaxSettings: { - url: location.href, - type: "GET", - isLocal: rlocalProtocol.test( location.protocol ), - global: true, - processData: true, - async: true, - contentType: "application/x-www-form-urlencoded; charset=UTF-8", - - /* - timeout: 0, - data: null, - dataType: null, - username: null, - password: null, - cache: null, - throws: false, - traditional: false, - headers: {}, - */ - - accepts: { - "*": allTypes, - text: "text/plain", - html: "text/html", - xml: "application/xml, text/xml", - json: "application/json, text/javascript" - }, - - contents: { - xml: /\bxml\b/, - html: /\bhtml/, - json: /\bjson\b/ - }, - - responseFields: { - xml: "responseXML", - text: "responseText", - json: "responseJSON" - }, - - // Data converters - // Keys separate source (or catchall "*") and destination types with a single space - converters: { - - // Convert anything to text - "* text": String, - - // Text to html (true = no transformation) - "text html": true, - - // Evaluate text as a json expression - "text json": JSON.parse, - - // Parse text as xml - "text xml": jQuery.parseXML - }, - - // For options that shouldn't be deep extended: - // you can add your own custom options here if - // and when you create one that shouldn't be - // deep extended (see ajaxExtend) - flatOptions: { - url: true, - context: true - } - }, - - // Creates a full fledged settings object into target - // with both ajaxSettings and settings fields. - // If target is omitted, writes into ajaxSettings. - ajaxSetup: function( target, settings ) { - return settings ? 
- - // Building a settings object - ajaxExtend( ajaxExtend( target, jQuery.ajaxSettings ), settings ) : - - // Extending ajaxSettings - ajaxExtend( jQuery.ajaxSettings, target ); - }, - - ajaxPrefilter: addToPrefiltersOrTransports( prefilters ), - ajaxTransport: addToPrefiltersOrTransports( transports ), - - // Main method - ajax: function( url, options ) { - - // If url is an object, simulate pre-1.5 signature - if ( typeof url === "object" ) { - options = url; - url = undefined; - } - - // Force options to be an object - options = options || {}; - - var transport, - - // URL without anti-cache param - cacheURL, - - // Response headers - responseHeadersString, - responseHeaders, - - // timeout handle - timeoutTimer, - - // Url cleanup var - urlAnchor, - - // Request state (becomes false upon send and true upon completion) - completed, - - // To know if global events are to be dispatched - fireGlobals, - - // Loop variable - i, - - // uncached part of the url - uncached, - - // Create the final options object - s = jQuery.ajaxSetup( {}, options ), - - // Callbacks context - callbackContext = s.context || s, - - // Context for global events is callbackContext if it is a DOM node or jQuery collection - globalEventContext = s.context && - ( callbackContext.nodeType || callbackContext.jquery ) ? 
- jQuery( callbackContext ) : - jQuery.event, - - // Deferreds - deferred = jQuery.Deferred(), - completeDeferred = jQuery.Callbacks( "once memory" ), - - // Status-dependent callbacks - statusCode = s.statusCode || {}, - - // Headers (they are sent all at once) - requestHeaders = {}, - requestHeadersNames = {}, - - // Default abort message - strAbort = "canceled", - - // Fake xhr - jqXHR = { - readyState: 0, - - // Builds headers hashtable if needed - getResponseHeader: function( key ) { - var match; - if ( completed ) { - if ( !responseHeaders ) { - responseHeaders = {}; - while ( ( match = rheaders.exec( responseHeadersString ) ) ) { - responseHeaders[ match[ 1 ].toLowerCase() + " " ] = - ( responseHeaders[ match[ 1 ].toLowerCase() + " " ] || [] ) - .concat( match[ 2 ] ); - } - } - match = responseHeaders[ key.toLowerCase() + " " ]; - } - return match == null ? null : match.join( ", " ); - }, - - // Raw string - getAllResponseHeaders: function() { - return completed ? responseHeadersString : null; - }, - - // Caches the header - setRequestHeader: function( name, value ) { - if ( completed == null ) { - name = requestHeadersNames[ name.toLowerCase() ] = - requestHeadersNames[ name.toLowerCase() ] || name; - requestHeaders[ name ] = value; - } - return this; - }, - - // Overrides response content-type header - overrideMimeType: function( type ) { - if ( completed == null ) { - s.mimeType = type; - } - return this; - }, - - // Status-dependent callbacks - statusCode: function( map ) { - var code; - if ( map ) { - if ( completed ) { - - // Execute the appropriate callbacks - jqXHR.always( map[ jqXHR.status ] ); - } else { - - // Lazy-add the new callbacks in a way that preserves old ones - for ( code in map ) { - statusCode[ code ] = [ statusCode[ code ], map[ code ] ]; - } - } - } - return this; - }, - - // Cancel the request - abort: function( statusText ) { - var finalText = statusText || strAbort; - if ( transport ) { - transport.abort( finalText ); - } - done( 
0, finalText ); - return this; - } - }; - - // Attach deferreds - deferred.promise( jqXHR ); - - // Add protocol if not provided (prefilters might expect it) - // Handle falsy url in the settings object (#10093: consistency with old signature) - // We also use the url parameter if available - s.url = ( ( url || s.url || location.href ) + "" ) - .replace( rprotocol, location.protocol + "//" ); - - // Alias method option to type as per ticket #12004 - s.type = options.method || options.type || s.method || s.type; - - // Extract dataTypes list - s.dataTypes = ( s.dataType || "*" ).toLowerCase().match( rnothtmlwhite ) || [ "" ]; - - // A cross-domain request is in order when the origin doesn't match the current origin. - if ( s.crossDomain == null ) { - urlAnchor = document.createElement( "a" ); - - // Support: IE <=8 - 11, Edge 12 - 15 - // IE throws exception on accessing the href property if url is malformed, - // e.g. http://example.com:80x/ - try { - urlAnchor.href = s.url; - - // Support: IE <=8 - 11 only - // Anchor's host property isn't correctly set when s.url is relative - urlAnchor.href = urlAnchor.href; - s.crossDomain = originAnchor.protocol + "//" + originAnchor.host !== - urlAnchor.protocol + "//" + urlAnchor.host; - } catch ( e ) { - - // If there is an error parsing the URL, assume it is crossDomain, - // it can be rejected by the transport if it is invalid - s.crossDomain = true; - } - } - - // Convert data if not already a string - if ( s.data && s.processData && typeof s.data !== "string" ) { - s.data = jQuery.param( s.data, s.traditional ); - } - - // Apply prefilters - inspectPrefiltersOrTransports( prefilters, s, options, jqXHR ); - - // If request was aborted inside a prefilter, stop there - if ( completed ) { - return jqXHR; - } - - // We can fire global events as of now if asked to - // Don't fire events if jQuery.event is undefined in an AMD-usage scenario (#15118) - fireGlobals = jQuery.event && s.global; - - // Watch for a new set of 
requests - if ( fireGlobals && jQuery.active++ === 0 ) { - jQuery.event.trigger( "ajaxStart" ); - } - - // Uppercase the type - s.type = s.type.toUpperCase(); - - // Determine if request has content - s.hasContent = !rnoContent.test( s.type ); - - // Save the URL in case we're toying with the If-Modified-Since - // and/or If-None-Match header later on - // Remove hash to simplify url manipulation - cacheURL = s.url.replace( rhash, "" ); - - // More options handling for requests with no content - if ( !s.hasContent ) { - - // Remember the hash so we can put it back - uncached = s.url.slice( cacheURL.length ); - - // If data is available and should be processed, append data to url - if ( s.data && ( s.processData || typeof s.data === "string" ) ) { - cacheURL += ( rquery.test( cacheURL ) ? "&" : "?" ) + s.data; - - // #9682: remove data so that it's not used in an eventual retry - delete s.data; - } - - // Add or update anti-cache param if needed - if ( s.cache === false ) { - cacheURL = cacheURL.replace( rantiCache, "$1" ); - uncached = ( rquery.test( cacheURL ) ? "&" : "?" ) + "_=" + ( nonce.guid++ ) + - uncached; - } - - // Put hash and anti-cache on the URL that will be requested (gh-1732) - s.url = cacheURL + uncached; - - // Change '%20' to '+' if this is encoded form body content (gh-2658) - } else if ( s.data && s.processData && - ( s.contentType || "" ).indexOf( "application/x-www-form-urlencoded" ) === 0 ) { - s.data = s.data.replace( r20, "+" ); - } - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
- if ( s.ifModified ) { - if ( jQuery.lastModified[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-Modified-Since", jQuery.lastModified[ cacheURL ] ); - } - if ( jQuery.etag[ cacheURL ] ) { - jqXHR.setRequestHeader( "If-None-Match", jQuery.etag[ cacheURL ] ); - } - } - - // Set the correct header, if data is being sent - if ( s.data && s.hasContent && s.contentType !== false || options.contentType ) { - jqXHR.setRequestHeader( "Content-Type", s.contentType ); - } - - // Set the Accepts header for the server, depending on the dataType - jqXHR.setRequestHeader( - "Accept", - s.dataTypes[ 0 ] && s.accepts[ s.dataTypes[ 0 ] ] ? - s.accepts[ s.dataTypes[ 0 ] ] + - ( s.dataTypes[ 0 ] !== "*" ? ", " + allTypes + "; q=0.01" : "" ) : - s.accepts[ "*" ] - ); - - // Check for headers option - for ( i in s.headers ) { - jqXHR.setRequestHeader( i, s.headers[ i ] ); - } - - // Allow custom headers/mimetypes and early abort - if ( s.beforeSend && - ( s.beforeSend.call( callbackContext, jqXHR, s ) === false || completed ) ) { - - // Abort if not done already and return - return jqXHR.abort(); - } - - // Aborting is no longer a cancellation - strAbort = "abort"; - - // Install callbacks on deferreds - completeDeferred.add( s.complete ); - jqXHR.done( s.success ); - jqXHR.fail( s.error ); - - // Get transport - transport = inspectPrefiltersOrTransports( transports, s, options, jqXHR ); - - // If no transport, we auto-abort - if ( !transport ) { - done( -1, "No Transport" ); - } else { - jqXHR.readyState = 1; - - // Send global event - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxSend", [ jqXHR, s ] ); - } - - // If request was aborted inside ajaxSend, stop there - if ( completed ) { - return jqXHR; - } - - // Timeout - if ( s.async && s.timeout > 0 ) { - timeoutTimer = window.setTimeout( function() { - jqXHR.abort( "timeout" ); - }, s.timeout ); - } - - try { - completed = false; - transport.send( requestHeaders, done ); - } catch ( e ) { - - // Rethrow post-completion 
exceptions - if ( completed ) { - throw e; - } - - // Propagate others as results - done( -1, e ); - } - } - - // Callback for when everything is done - function done( status, nativeStatusText, responses, headers ) { - var isSuccess, success, error, response, modified, - statusText = nativeStatusText; - - // Ignore repeat invocations - if ( completed ) { - return; - } - - completed = true; - - // Clear timeout if it exists - if ( timeoutTimer ) { - window.clearTimeout( timeoutTimer ); - } - - // Dereference transport for early garbage collection - // (no matter how long the jqXHR object will be used) - transport = undefined; - - // Cache response headers - responseHeadersString = headers || ""; - - // Set readyState - jqXHR.readyState = status > 0 ? 4 : 0; - - // Determine if successful - isSuccess = status >= 200 && status < 300 || status === 304; - - // Get response data - if ( responses ) { - response = ajaxHandleResponses( s, jqXHR, responses ); - } - - // Use a noop converter for missing script - if ( !isSuccess && jQuery.inArray( "script", s.dataTypes ) > -1 ) { - s.converters[ "text script" ] = function() {}; - } - - // Convert no matter what (that way responseXXX fields are always set) - response = ajaxConvert( s, response, jqXHR, isSuccess ); - - // If successful, handle type chaining - if ( isSuccess ) { - - // Set the If-Modified-Since and/or If-None-Match header, if in ifModified mode. 
- if ( s.ifModified ) { - modified = jqXHR.getResponseHeader( "Last-Modified" ); - if ( modified ) { - jQuery.lastModified[ cacheURL ] = modified; - } - modified = jqXHR.getResponseHeader( "etag" ); - if ( modified ) { - jQuery.etag[ cacheURL ] = modified; - } - } - - // if no content - if ( status === 204 || s.type === "HEAD" ) { - statusText = "nocontent"; - - // if not modified - } else if ( status === 304 ) { - statusText = "notmodified"; - - // If we have data, let's convert it - } else { - statusText = response.state; - success = response.data; - error = response.error; - isSuccess = !error; - } - } else { - - // Extract error from statusText and normalize for non-aborts - error = statusText; - if ( status || !statusText ) { - statusText = "error"; - if ( status < 0 ) { - status = 0; - } - } - } - - // Set data for the fake xhr object - jqXHR.status = status; - jqXHR.statusText = ( nativeStatusText || statusText ) + ""; - - // Success/Error - if ( isSuccess ) { - deferred.resolveWith( callbackContext, [ success, statusText, jqXHR ] ); - } else { - deferred.rejectWith( callbackContext, [ jqXHR, statusText, error ] ); - } - - // Status-dependent callbacks - jqXHR.statusCode( statusCode ); - statusCode = undefined; - - if ( fireGlobals ) { - globalEventContext.trigger( isSuccess ? "ajaxSuccess" : "ajaxError", - [ jqXHR, s, isSuccess ? 
success : error ] ); - } - - // Complete - completeDeferred.fireWith( callbackContext, [ jqXHR, statusText ] ); - - if ( fireGlobals ) { - globalEventContext.trigger( "ajaxComplete", [ jqXHR, s ] ); - - // Handle the global AJAX counter - if ( !( --jQuery.active ) ) { - jQuery.event.trigger( "ajaxStop" ); - } - } - } - - return jqXHR; - }, - - getJSON: function( url, data, callback ) { - return jQuery.get( url, data, callback, "json" ); - }, - - getScript: function( url, callback ) { - return jQuery.get( url, undefined, callback, "script" ); - } -} ); - -jQuery.each( [ "get", "post" ], function( _i, method ) { - jQuery[ method ] = function( url, data, callback, type ) { - - // Shift arguments if data argument was omitted - if ( isFunction( data ) ) { - type = type || callback; - callback = data; - data = undefined; - } - - // The url can be an options object (which then must have .url) - return jQuery.ajax( jQuery.extend( { - url: url, - type: method, - dataType: type, - data: data, - success: callback - }, jQuery.isPlainObject( url ) && url ) ); - }; -} ); - -jQuery.ajaxPrefilter( function( s ) { - var i; - for ( i in s.headers ) { - if ( i.toLowerCase() === "content-type" ) { - s.contentType = s.headers[ i ] || ""; - } - } -} ); - - -jQuery._evalUrl = function( url, options, doc ) { - return jQuery.ajax( { - url: url, - - // Make this explicit, since user can override this through ajaxSetup (#11264) - type: "GET", - dataType: "script", - cache: true, - async: false, - global: false, - - // Only evaluate the response if it is successful (gh-4126) - // dataFilter is not invoked for failure responses, so using it instead - // of the default converter is kludgy but it works. 
- converters: { - "text script": function() {} - }, - dataFilter: function( response ) { - jQuery.globalEval( response, options, doc ); - } - } ); -}; - - -jQuery.fn.extend( { - wrapAll: function( html ) { - var wrap; - - if ( this[ 0 ] ) { - if ( isFunction( html ) ) { - html = html.call( this[ 0 ] ); - } - - // The elements to wrap the target around - wrap = jQuery( html, this[ 0 ].ownerDocument ).eq( 0 ).clone( true ); - - if ( this[ 0 ].parentNode ) { - wrap.insertBefore( this[ 0 ] ); - } - - wrap.map( function() { - var elem = this; - - while ( elem.firstElementChild ) { - elem = elem.firstElementChild; - } - - return elem; - } ).append( this ); - } - - return this; - }, - - wrapInner: function( html ) { - if ( isFunction( html ) ) { - return this.each( function( i ) { - jQuery( this ).wrapInner( html.call( this, i ) ); - } ); - } - - return this.each( function() { - var self = jQuery( this ), - contents = self.contents(); - - if ( contents.length ) { - contents.wrapAll( html ); - - } else { - self.append( html ); - } - } ); - }, - - wrap: function( html ) { - var htmlIsFunction = isFunction( html ); - - return this.each( function( i ) { - jQuery( this ).wrapAll( htmlIsFunction ? 
html.call( this, i ) : html ); - } ); - }, - - unwrap: function( selector ) { - this.parent( selector ).not( "body" ).each( function() { - jQuery( this ).replaceWith( this.childNodes ); - } ); - return this; - } -} ); - - -jQuery.expr.pseudos.hidden = function( elem ) { - return !jQuery.expr.pseudos.visible( elem ); -}; -jQuery.expr.pseudos.visible = function( elem ) { - return !!( elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length ); -}; - - - - -jQuery.ajaxSettings.xhr = function() { - try { - return new window.XMLHttpRequest(); - } catch ( e ) {} -}; - -var xhrSuccessStatus = { - - // File protocol always yields status code 0, assume 200 - 0: 200, - - // Support: IE <=9 only - // #1450: sometimes IE returns 1223 when it should be 204 - 1223: 204 - }, - xhrSupported = jQuery.ajaxSettings.xhr(); - -support.cors = !!xhrSupported && ( "withCredentials" in xhrSupported ); -support.ajax = xhrSupported = !!xhrSupported; - -jQuery.ajaxTransport( function( options ) { - var callback, errorCallback; - - // Cross domain only allowed if supported through XMLHttpRequest - if ( support.cors || xhrSupported && !options.crossDomain ) { - return { - send: function( headers, complete ) { - var i, - xhr = options.xhr(); - - xhr.open( - options.type, - options.url, - options.async, - options.username, - options.password - ); - - // Apply custom fields if provided - if ( options.xhrFields ) { - for ( i in options.xhrFields ) { - xhr[ i ] = options.xhrFields[ i ]; - } - } - - // Override mime type if needed - if ( options.mimeType && xhr.overrideMimeType ) { - xhr.overrideMimeType( options.mimeType ); - } - - // X-Requested-With header - // For cross-domain requests, seeing as conditions for a preflight are - // akin to a jigsaw puzzle, we simply never set it to be sure. - // (it can always be set on a per-request basis or even using ajaxSetup) - // For same-domain requests, won't change header if already provided. 
- if ( !options.crossDomain && !headers[ "X-Requested-With" ] ) { - headers[ "X-Requested-With" ] = "XMLHttpRequest"; - } - - // Set headers - for ( i in headers ) { - xhr.setRequestHeader( i, headers[ i ] ); - } - - // Callback - callback = function( type ) { - return function() { - if ( callback ) { - callback = errorCallback = xhr.onload = - xhr.onerror = xhr.onabort = xhr.ontimeout = - xhr.onreadystatechange = null; - - if ( type === "abort" ) { - xhr.abort(); - } else if ( type === "error" ) { - - // Support: IE <=9 only - // On a manual native abort, IE9 throws - // errors on any property access that is not readyState - if ( typeof xhr.status !== "number" ) { - complete( 0, "error" ); - } else { - complete( - - // File: protocol always yields status 0; see #8605, #14207 - xhr.status, - xhr.statusText - ); - } - } else { - complete( - xhrSuccessStatus[ xhr.status ] || xhr.status, - xhr.statusText, - - // Support: IE <=9 only - // IE9 has no XHR2 but throws on binary (trac-11426) - // For XHR2 non-text, let the caller handle it (gh-2498) - ( xhr.responseType || "text" ) !== "text" || - typeof xhr.responseText !== "string" ? 
- { binary: xhr.response } : - { text: xhr.responseText }, - xhr.getAllResponseHeaders() - ); - } - } - }; - }; - - // Listen to events - xhr.onload = callback(); - errorCallback = xhr.onerror = xhr.ontimeout = callback( "error" ); - - // Support: IE 9 only - // Use onreadystatechange to replace onabort - // to handle uncaught aborts - if ( xhr.onabort !== undefined ) { - xhr.onabort = errorCallback; - } else { - xhr.onreadystatechange = function() { - - // Check readyState before timeout as it changes - if ( xhr.readyState === 4 ) { - - // Allow onerror to be called first, - // but that will not handle a native abort - // Also, save errorCallback to a variable - // as xhr.onerror cannot be accessed - window.setTimeout( function() { - if ( callback ) { - errorCallback(); - } - } ); - } - }; - } - - // Create the abort callback - callback = callback( "abort" ); - - try { - - // Do send the request (this may raise an exception) - xhr.send( options.hasContent && options.data || null ); - } catch ( e ) { - - // #14683: Only rethrow if this hasn't been notified as an error yet - if ( callback ) { - throw e; - } - } - }, - - abort: function() { - if ( callback ) { - callback(); - } - } - }; - } -} ); - - - - -// Prevent auto-execution of scripts when no explicit dataType was provided (See gh-2432) -jQuery.ajaxPrefilter( function( s ) { - if ( s.crossDomain ) { - s.contents.script = false; - } -} ); - -// Install script dataType -jQuery.ajaxSetup( { - accepts: { - script: "text/javascript, application/javascript, " + - "application/ecmascript, application/x-ecmascript" - }, - contents: { - script: /\b(?:java|ecma)script\b/ - }, - converters: { - "text script": function( text ) { - jQuery.globalEval( text ); - return text; - } - } -} ); - -// Handle cache's special case and crossDomain -jQuery.ajaxPrefilter( "script", function( s ) { - if ( s.cache === undefined ) { - s.cache = false; - } - if ( s.crossDomain ) { - s.type = "GET"; - } -} ); - -// Bind script tag hack 
transport -jQuery.ajaxTransport( "script", function( s ) { - - // This transport only deals with cross domain or forced-by-attrs requests - if ( s.crossDomain || s.scriptAttrs ) { - var script, callback; - return { - send: function( _, complete ) { - script = jQuery( " + + @@ -29,7 +31,7 @@ - + @@ -73,15 +75,68 @@
Here are the classes, structs, unions and interfaces with brief descriptions:
-
[detail level 12]
- - +
[detail level 12]
 Nfbgemm_gpu
 CComparatorWarp bitonic K/V sorting code from @jhj
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
 Nfbgemm_gpu
 CBitonicSort
 CComparatorWarp bitonic K/V sorting code
 CDefaultPtrTraits
 Cenum_registration
 CFixedDivisor
 CGenericPackedTensorAccessor
 CGenericPackedTensorAccessor< T, 1, PtrTraits, index_t >
 CGenericPackedTensorAccessorBase
 CHalf4
 CPermutePooledEmbsFunction
 CPermutePooledEmbsFunctionSplit
 Crk_state
 CSharedMemory
 CSharedMemory< double >
 CSharedMemory< float >
 CSharedMemory< int32_t >
 CSharedMemory< int64_t >
 CSharedMemory< Vec4T< at::acc_type< double, true > > >
 CSharedMemory< Vec4T< at::acc_type< float, true > > >
 CStochasticRoundingRNGState
 CTensorAccessor
 CTensorAccessor< T, 1, PtrTraits, index_t >
 CTensorAccessorBase
 CVec4AccT
 CVec4StepT
 CVec4StepT< STEP, at::Half >
 CVec4StepT< STEP, float >
 CVec4StepT< STEP, uint8_t >
 CVec4T
 CVec4T< at::BFloat16 >
 CVec4T< at::Half >
 CVec4T< double >
 CVec4T< float >
 CVecNT
 CVecNT< 1, PrimitiveType::FP >
 CVecNT< 16, PrimitiveType::INT >
 CVecNT< 2, PrimitiveType::FP >
 CVecNT< 4, PrimitiveType::FP >
 CVecNT< 4, PrimitiveType::INT >
 CVecNT< 8, PrimitiveType::INT >
 CWeightRow
 Ninternal
 CHyperCompressedSparseColumn
 Nssd
 CEmbeddingRocksDB
 CInitializer
 Clog2_calc
 Clog2_calc_
 Clog2_calc_< 0 >
 CStackArray
 CVec4Type
 CVec4Type< at::Half >
 CVec4Type< float >
 CVec4Type< uint8_t >
diff --git a/batch__index__select__dim0__cpu__host_8cpp.html b/batch__index__select__dim0__cpu__host_8cpp.html new file mode 100644 index 000000000..9e3f63b23 --- /dev/null +++ b/batch__index__select__dim0__cpu__host_8cpp.html @@ -0,0 +1,193 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/batch_index_select_dim0_cpu_host.cpp File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
batch_index_select_dim0_cpu_host.cpp File Reference
+
+
+
#include <ATen/ATen.h>
+#include <ATen/TypeDefault.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <torch/script.h>
+#include "fbgemm_gpu/embedding_common.h"
+#include "fbgemm_gpu/sparse_ops.h"
+#include "fbgemm_gpu/sparse_ops_utils.h"
+

Typedef Documentation

+ +

◆ Tensor

+ +
+
+ + + + +
using Tensor = at::Tensor
+
+ +
+
+

Function Documentation

+ +

◆ batch_index_select_dim0_cpu()

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tensor batch_index_select_dim0_cpu (Tensor inputs,
Tensor indices,
std::vector< int64_t > input_num_indices,
std::vector< int64_t > input_rows,
std::vector< int64_t > input_columns,
const bool permute_output_dim_0_1 )
+
+ +
+
+ +

◆ TORCH_LIBRARY_FRAGMENT() [1/2]

+ +
+
+ + + + + + + + + + + +
TORCH_LIBRARY_FRAGMENT (fb ,
m  )
+
+ +
+
+ +

◆ TORCH_LIBRARY_FRAGMENT() [2/2]

+ +
+
+ + + + + + + + + + + +
TORCH_LIBRARY_FRAGMENT (fbgemm ,
m  )
+
+ +
+
+
+ + + + diff --git a/batch__index__select__dim0__host_8cpp.html b/batch__index__select__dim0__host_8cpp.html new file mode 100644 index 000000000..80de2bca2 --- /dev/null +++ b/batch__index__select__dim0__host_8cpp.html @@ -0,0 +1,345 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/batch_index_select_dim0_host.cpp File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
batch_index_select_dim0_host.cpp File Reference
+
+
+
#include <ATen/ATen.h>
+#include <ATen/TypeDefault.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <torch/script.h>
+#include "fbgemm_gpu/embedding_common.h"
+#include "fbgemm_gpu/sparse_ops.h"
+#include "fbgemm_gpu/sparse_ops_utils.h"
+

Typedef Documentation

+ +

◆ Tensor

+ +
+
+ + + + +
using Tensor = at::Tensor
+
+ +
+
+

Function Documentation

+ +

◆ batch_index_select_dim0_codegen_backward_cuda()

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tensor batch_index_select_dim0_codegen_backward_cuda (const Tensor & grad_output,
const Tensor & dev_weights,
const Tensor & weights_offsets,
const Tensor & D_offsets,
const int64_t max_D,
const Tensor & hash_size_cumsum,
const int64_t total_hash_size_bits,
const Tensor & indices,
const int64_t max_segment_length_per_warp,
const Tensor & grad_offsets,
const Tensor & total_L_offsets,
const int32_t fixed_L_per_warp,
const int32_t num_warps_per_feature,
const bool permute_output_dim_0_1 )
+
+ +
+
+ +

◆ batch_index_select_dim0_codegen_forward_cuda()

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tensor batch_index_select_dim0_codegen_forward_cuda (const Tensor & dev_weights,
const Tensor & weights_offsets,
const Tensor & D_offsets,
const int64_t max_D,
const Tensor & indices,
const int64_t output_dtype,
const Tensor & output_offsets,
const Tensor & total_L_offsets,
const int64_t output_size,
const int32_t fixed_L_per_warp,
const int32_t num_warps_per_feature,
const bool permute_output_dim_0_1 )
+
+ +
+
+ +

◆ batch_index_select_dim0_gpu()

+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Tensor batch_index_select_dim0_gpu (Tensor inputs,
Tensor indices,
std::vector< int64_t > input_num_indices,
std::vector< int64_t > input_rows,
std::vector< int64_t > input_columns,
const bool permute_output_dim_0_1 )
+
+ +
+
+ +

◆ TORCH_LIBRARY_FRAGMENT() [1/2]

+ +
+
+ + + + + + + + + + + +
TORCH_LIBRARY_FRAGMENT (fb ,
m  )
+
+ +
+
+ +

◆ TORCH_LIBRARY_FRAGMENT() [2/2]

+ +
+
+ + + + + + + + + + + +
TORCH_LIBRARY_FRAGMENT (fbgemm ,
m  )
+
+ +
+
+
+ + + + diff --git a/batched__dense__vec__jagged__2d__mul__backward_8cu.html b/batched__dense__vec__jagged__2d__mul__backward_8cu.html new file mode 100644 index 000000000..71735d7bf --- /dev/null +++ b/batched__dense__vec__jagged__2d__mul__backward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_backward.cu File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
batched_dense_vec_jagged_2d_mul_backward.cu File Reference
+
+
+
#include "common.cuh"
+
+ + + +

+Namespaces

namespace  fbgemm_gpu
 
+

Typedef Documentation

+ +

◆ Tensor

+ +
+
+ + + + +
using Tensor = at::Tensor
+
+ +
+
+

Function Documentation

+ +

◆ FBGEMM_OP_DISPATCH()

+ +
+
+ + + + + + + + + + + + + + + + +
FBGEMM_OP_DISPATCH (CUDA ,
"batched_dense_vec_jagged_2d_mul_backward" ,
fbgemm_gpu::batched_dense_vec_jagged_2d_mul_backward  )
+
+ +
+
+
+ + + + diff --git a/batched__dense__vec__jagged__2d__mul__forward_8cu.html b/batched__dense__vec__jagged__2d__mul__forward_8cu.html new file mode 100644 index 000000000..19f141cb7 --- /dev/null +++ b/batched__dense__vec__jagged__2d__mul__forward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/batched_dense_vec_jagged_2d_mul_forward.cu File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
batched_dense_vec_jagged_2d_mul_forward.cu File Reference
+
+
+
#include "common.cuh"
+
+ + + +

+Namespaces

namespace  fbgemm_gpu
 
+

Typedef Documentation

+ +

◆ Tensor

+ +
+
+ + + + +
using Tensor = at::Tensor
+
+ +
+
+

Function Documentation

+ +

◆ FBGEMM_OP_DISPATCH()

+ +
+
+ + + + + + + + + + + + + + + + +
FBGEMM_OP_DISPATCH (CUDA ,
"batched_dense_vec_jagged_2d_mul_forward" ,
fbgemm_gpu::batched_dense_vec_jagged_2d_mul_forward  )
+
+ +
+
+
+ + + + diff --git a/bench__utils_8cuh.html b/bench__utils_8cuh.html new file mode 100644 index 000000000..eceedcb78 --- /dev/null +++ b/bench__utils_8cuh.html @@ -0,0 +1,238 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/bench_utils.cuh File Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
bench_utils.cuh File Reference
+
+
+
#include <c10/cuda/CUDAException.h>
+#include <cuda.h>
+#include <curand.h>
+#include <curand_kernel.h>
+#include <vector>
+#include "./cuda_utils.cuh"
+

Function Documentation

+ +

◆ __launch_bounds__()

+ +
+
+ + + + + + + +
__global__ __launch_bounds__ (kMaxThreads )
+
+ +
+
+ +

◆ benchmark_function()

+ +
+
+
+template<typename Lambda >
+ + + + + + + + + + + +
float benchmark_function (int iters,
Lambda && f )
+
+ +
+
+ +

◆ flush_cache()

+ +
+
+ + + + + + + + + + + +
void flush_cache (int cache_size_mb = 40,
bool do_write = false )
+
+ +
+
+ +

◆ generate_random_table()

+ +
+
+ + + + + + + + + + + +
void generate_random_table (float * d_f32_table,
unsigned size )
+
+ +
+
+ +

◆ if()

+ +
+
+ + + + + + + +
if (do_write * val)
+
+ +
+
+

Variable Documentation

+ +

◆ d_flush2

+ +
+
+ + + + +
__global__ char* d_flush2
+
+ +
+
+ +

◆ do_write

+ +
+
+ + + + +
__global__ char bool do_write
+
+Initial value:
{
+
int idx = blockIdx.x * blockDim.x + threadIdx.x
+
+
+
+ +

◆ val

+ +
+
+ + + + +
char val = d_flush[idx]
+
+ +
+
+
+ + + + diff --git a/classes.html b/classes.html index 24145a841..c4683d8ed 100644 --- a/classes.html +++ b/classes.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Class Index + + @@ -29,7 +31,7 @@ - + @@ -72,16 +74,58 @@
Class Index
- +
B | C | D | E | F | G | H | I | L | P | R | S | T | V | W
+
B
+
BitonicSort (fbgemm_gpu)
+
C
-
Comparator (fbgemm_gpu)
+
Comparator (fbgemm_gpu)
+
+
D
+
DefaultPtrTraits (fbgemm_gpu)
+
+
E
+
EmbeddingRocksDB (ssd)
enum_registration (fbgemm_gpu)
+
+
F
+
FixedDivisor (fbgemm_gpu)
+
+
G
+
GenericPackedTensorAccessor (fbgemm_gpu)
GenericPackedTensorAccessor< T, 1, PtrTraits, index_t > (fbgemm_gpu)
GenericPackedTensorAccessorBase (fbgemm_gpu)
+
+
H
+
Half4 (fbgemm_gpu)
HyperCompressedSparseColumn (internal)
+
+
I
+
Initializer (ssd)
+
+
L
+
log2_calc
log2_calc_
log2_calc_< 0 >
+
+
P
+
PermutePooledEmbsFunction (fbgemm_gpu)
PermutePooledEmbsFunctionSplit (fbgemm_gpu)
+
+
R
+
rk_state (fbgemm_gpu)
+
+
S
+
SharedMemory (fbgemm_gpu)
SharedMemory< double > (fbgemm_gpu)
SharedMemory< float > (fbgemm_gpu)
SharedMemory< int32_t > (fbgemm_gpu)
SharedMemory< int64_t > (fbgemm_gpu)
SharedMemory< Vec4T< at::acc_type< double, true > > > (fbgemm_gpu)
SharedMemory< Vec4T< at::acc_type< float, true > > > (fbgemm_gpu)
StackArray
StochasticRoundingRNGState (fbgemm_gpu)
+
+
T
+
TensorAccessor (fbgemm_gpu)
TensorAccessor< T, 1, PtrTraits, index_t > (fbgemm_gpu)
TensorAccessorBase (fbgemm_gpu)
+
+
V
+
Vec4AccT (fbgemm_gpu)
Vec4StepT (fbgemm_gpu)
Vec4StepT< STEP, at::Half > (fbgemm_gpu)
Vec4StepT< STEP, float > (fbgemm_gpu)
Vec4StepT< STEP, uint8_t > (fbgemm_gpu)
Vec4T (fbgemm_gpu)
Vec4T< at::BFloat16 > (fbgemm_gpu)
Vec4T< at::Half > (fbgemm_gpu)
Vec4T< double > (fbgemm_gpu)
Vec4T< float > (fbgemm_gpu)
Vec4Type
Vec4Type< at::Half >
Vec4Type< float >
Vec4Type< uint8_t >
VecNT (fbgemm_gpu)
VecNT< 1, PrimitiveType::FP > (fbgemm_gpu)
VecNT< 16, PrimitiveType::INT > (fbgemm_gpu)
VecNT< 2, PrimitiveType::FP > (fbgemm_gpu)
VecNT< 4, PrimitiveType::FP > (fbgemm_gpu)
VecNT< 4, PrimitiveType::INT > (fbgemm_gpu)
VecNT< 8, PrimitiveType::INT > (fbgemm_gpu)
+
+
W
+
WeightRow (fbgemm_gpu)
diff --git a/classfbgemm__gpu_1_1_fixed_divisor-members.html b/classfbgemm__gpu_1_1_fixed_divisor-members.html new file mode 100644 index 000000000..f7e47aa01 --- /dev/null +++ b/classfbgemm__gpu_1_1_fixed_divisor-members.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
FixedDivisor Member List
+
+
+ +

This is the complete list of members for FixedDivisor, including all inherited members.

+ + + + + + +
D() constFixedDivisorinline
Div(const int32_t n) constFixedDivisorinline
DivMod(const int32_t n, int32_t *q, int32_t *r) constFixedDivisorinline
FixedDivisor(const int32_t d)FixedDivisorinlineexplicit
Mod(const int32_t n) constFixedDivisorinline
+ + + + diff --git a/classfbgemm__gpu_1_1_fixed_divisor.html b/classfbgemm__gpu_1_1_fixed_divisor.html new file mode 100644 index 000000000..80b8065f8 --- /dev/null +++ b/classfbgemm__gpu_1_1_fixed_divisor.html @@ -0,0 +1,234 @@ + + + + + + + +fbgemm_gpu: FixedDivisor Class Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
FixedDivisor Class Reference
+
+
+

Constructor & Destructor Documentation

+ +

◆ FixedDivisor()

+ +
+
+ + + + + +
+ + + + + + + +
FixedDivisor (const int32_t d)
+
+inlineexplicit
+
+ +
+
+

Member Function Documentation

+ +

◆ D()

+ +
+
+ + + + + +
+ + + + + + + +
DEVICE_INLINE int32_t D () const
+
+inline
+
+ +
+
+ +

◆ Div()

+ +
+
+ + + + + +
+ + + + + + + +
DEVICE_INLINE int32_t Div (const int32_t n) const
+
+inline
+
+ +

Calculates q = n / d.

+ +
+
+ +

◆ DivMod()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + +
DEVICE_INLINE void DivMod (const int32_t n,
int32_t * q,
int32_t * r ) const
+
+inline
+
+ +

Calculates q = n / d and r = n % d together.

+ +
+
+ +

◆ Mod()

+ +
+
+ + + + + +
+ + + + + + + +
DEVICE_INLINE int32_t Mod (const int32_t n) const
+
+inline
+
+ +

Calculates r = n % d.

+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor-members.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor-members.html new file mode 100644 index 000000000..ffc7b19ad --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor-members.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
GenericPackedTensorAccessor< T, N, PtrTraits, index_t > Member List
+
+
+ +

This is the complete list of members for GenericPackedTensorAccessor< T, N, PtrTraits, index_t >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + +
at(index_t idx) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
bounds_check_(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inlineprotected
copy_str(char *dst, const char *src, const size_t max_len)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data()GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data() constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
func_name_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessor< T, N, PtrTraits, index_t >inline
GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessor< T, N, PtrTraits, index_t >inline
GenericPackedTensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
GenericPackedTensorAccessorBase(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
numel_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
operator[](index_t i)GenericPackedTensorAccessor< T, N, PtrTraits, index_t >inline
operator[](index_t i) constGenericPackedTensorAccessor< T, N, PtrTraits, index_t >inline
ptr_name_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
PtrType typedefGenericPackedTensorAccessor< T, N, PtrTraits, index_t >
size(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
sizes_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
stride(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
strides_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
transpose(index_t dim1, index_t dim2) constGenericPackedTensorAccessor< T, N, PtrTraits, index_t >inline
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html new file mode 100644 index 000000000..e43469bd8 --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html @@ -0,0 +1,309 @@ + + + + + + + +fbgemm_gpu: GenericPackedTensorAccessor< T, N, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
GenericPackedTensorAccessor< T, N, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for GenericPackedTensorAccessor< T, N, PtrTraits, index_t >:
+
+
+ + +GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t > + +
+ + + + +

+Public Member Functions

C10_HOST GenericPackedTensorAccessor< T, N, PtrTraits, index_ttranspose (index_t dim1, index_t dim2) const
 
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ GenericPackedTensorAccessor() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+ +

◆ GenericPackedTensorAccessor() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+
+template<typename source_index_t , class = typename std::enable_if< std::is_same<source_index_t, int64_t>::value>::type>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor (PtrType data,
const source_index_t *const sizes,
const source_index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator[]() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_DEVICE TensorAccessor< T, N - 1, PtrTraits, index_t > operator[] (index_t i)
+
+inline
+
+ +
+
+ +

◆ operator[]() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_DEVICE const TensorAccessor< T, N - 1, PtrTraits, index_t > operator[] (index_t i) const
+
+inline
+
+ +
+
+ +

◆ transpose()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor< T, N, PtrTraits, index_t > transpose (index_t dim1,
index_t dim2 ) const
+
+inline
+
+

Returns a PackedTensorAccessor of the same dimension after transposing the two dimensions given. Does not actually move elements; transposition is made by permuting the size/stride arrays. If the dimensions are not valid, asserts.

+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.png b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.png new file mode 100644 index 000000000..71fdd3b72 Binary files /dev/null and b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor.png differ diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html new file mode 100644 index 000000000..a6de37d61 --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
GenericPackedTensorAccessor< T, 1, PtrTraits, index_t > Member List
+
+
+ +

This is the complete list of members for GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + + + + +
at(index_t idx) constGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
bounds_check_(index_t i) constGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inlineprotected
copy_str(char *dst, const char *src, const size_t max_len)GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
data()GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
data() constGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
data_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
func_name_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >inline
GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >inline
GenericPackedTensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
GenericPackedTensorAccessorBase(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
numel_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
operator[](index_t i)GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >inline
operator[](index_t i) constGenericPackedTensorAccessor< T, 1, PtrTraits, index_t >inline
ptr_name_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
PtrType typedefGenericPackedTensorAccessor< T, 1, PtrTraits, index_t >
size(index_t i) constGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
sizes_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
stride(index_t i) constGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >inline
strides_GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >protected
transpose(index_t dim1, index_t dim2) constGenericPackedTensorAccessor< T, 1, PtrTraits, index_t >inline
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html new file mode 100644 index 000000000..9dc85f502 --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html @@ -0,0 +1,301 @@ + + + + + + + +fbgemm_gpu: GenericPackedTensorAccessor< T, 1, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
GenericPackedTensorAccessor< T, 1, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >:
+
+
+ + +GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t > + +
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ GenericPackedTensorAccessor() [1/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+ +

◆ GenericPackedTensorAccessor() [2/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+
+template<typename source_index_t , class = typename std::enable_if< std::is_same<source_index_t, int64_t>::value>::type>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor (PtrType data,
const source_index_t *const sizes,
const source_index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator[]() [1/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + +
C10_DEVICE T & operator[] (index_t i)
+
+inline
+
+ +
+
+ +

◆ operator[]() [2/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + +
C10_DEVICE const T & operator[] (index_t i) const
+
+inline
+
+ +
+
+ +

◆ transpose()

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessor< T, 1, PtrTraits, index_t > transpose (index_t dim1,
index_t dim2 ) const
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png new file mode 100644 index 000000000..8940c1c33 Binary files /dev/null and b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png differ diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base-members.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base-members.html new file mode 100644 index 000000000..41c5c7a91 --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base-members.html @@ -0,0 +1,106 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t > Member List
+
+
+ +

This is the complete list of members for GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >, including all inherited members.

+ + + + + + + + + + + + + + + + + +
at(index_t idx) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
bounds_check_(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inlineprotected
copy_str(char *dst, const char *src, const size_t max_len)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data()GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data() constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
data_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
func_name_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
GenericPackedTensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
GenericPackedTensorAccessorBase(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
numel_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
ptr_name_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
PtrType typedefGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >
size(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
sizes_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
stride(index_t i) constGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >inline
strides_GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >protected
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html new file mode 100644 index 000000000..1e1ddba42 --- /dev/null +++ b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html @@ -0,0 +1,559 @@ + + + + + + + +fbgemm_gpu: GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >:
+
+
+ + +GenericPackedTensorAccessor< T, N, PtrTraits, index_t > + +
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ GenericPackedTensorAccessorBase() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessorBase (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+ +

◆ GenericPackedTensorAccessorBase() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+
+template<typename source_index_t , class = typename std::enable_if< std::is_same<source_index_t, int64_t>::value>::type>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST GenericPackedTensorAccessorBase (PtrType data,
const source_index_t *const sizes,
const source_index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ at()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE T & at (index_t idx) const
+
+inline
+
+ +
+
+ +

◆ bounds_check_()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST void bounds_check_ (index_t i) const
+
+inlineprotected
+
+ +
+
+ +

◆ copy_str()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + + + + + + +
C10_HOST void copy_str (char * dst,
const char * src,
const size_t max_len )
+
+inline
+
+ +
+
+ +

◆ data() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE PtrType data ()
+
+inline
+
+ +
+
+ +

◆ data() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE const PtrType data () const
+
+inline
+
+ +
+
+ +

◆ size()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE index_t size (index_t i) const
+
+inline
+
+ +
+
+ +

◆ stride()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE index_t stride (index_t i) const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ data_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
PtrType data_
+
+protected
+
+ +
+
+ +

◆ func_name_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
char func_name_[FUNC_NAME_MAX_LEN]
+
+protected
+
+ +
+
+ +

◆ numel_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
index_t numel_
+
+protected
+
+ +
+
+ +

◆ ptr_name_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
char ptr_name_[PTR_NAME_MAX_LEN]
+
+protected
+
+ +
+
+ +

◆ sizes_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
index_t sizes_[N]
+
+protected
+
+ +
+
+ +

◆ strides_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
index_t strides_[N]
+
+protected
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.png b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.png new file mode 100644 index 000000000..98add50b8 Binary files /dev/null and b/classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.png differ diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function-members.html b/classfbgemm__gpu_1_1_permute_pooled_embs_function-members.html new file mode 100644 index 000000000..d413a41e2 --- /dev/null +++ b/classfbgemm__gpu_1_1_permute_pooled_embs_function-members.html @@ -0,0 +1,92 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
PermutePooledEmbsFunction Member List
+
+
+ +

This is the complete list of members for PermutePooledEmbsFunction, including all inherited members.

+ + + +
backward(AutogradContext *ctx, variable_list grad_output)PermutePooledEmbsFunctionstatic
forward(AutogradContext *ctx, const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list, const bool &allow_duplicates=false)PermutePooledEmbsFunctionstatic
+ + + + diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function.html b/classfbgemm__gpu_1_1_permute_pooled_embs_function.html new file mode 100644 index 000000000..0de750353 --- /dev/null +++ b/classfbgemm__gpu_1_1_permute_pooled_embs_function.html @@ -0,0 +1,185 @@ + + + + + + + +fbgemm_gpu: PermutePooledEmbsFunction Class Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
PermutePooledEmbsFunction Class Reference
+
+
+ +

#include <permute_pooled_embedding_ops.h>

+
+Inheritance diagram for PermutePooledEmbsFunction:
+
+
+ +
+

Member Function Documentation

+ +

◆ backward()

+ +
+
+ + + + + +
+ + + + + + + + + + + +
variable_list backward (AutogradContext * ctx,
variable_list grad_output )
+
+static
+
+ +
+
+ +

◆ forward()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Variable forward (AutogradContext * ctx,
const at::Tensor & pooled_embs,
const at::Tensor & offset_dim_list,
const at::Tensor & permute_list,
const at::Tensor & inv_offset_dim_list,
const at::Tensor & inv_permute_list,
const bool & allow_duplicates = false )
+
+static
+
+ +
+
+
The documentation for this class was generated from the following files: +
+ + + + diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function.png b/classfbgemm__gpu_1_1_permute_pooled_embs_function.png new file mode 100644 index 000000000..ebef54aa5 Binary files /dev/null and b/classfbgemm__gpu_1_1_permute_pooled_embs_function.png differ diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function_split-members.html b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split-members.html new file mode 100644 index 000000000..93626edbb --- /dev/null +++ b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split-members.html @@ -0,0 +1,92 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
PermutePooledEmbsFunctionSplit< permute_pooled_embs_op > Member List
+
+
+ +

This is the complete list of members for PermutePooledEmbsFunctionSplit< permute_pooled_embs_op >, including all inherited members.

+ + + +
backward(AutogradContext *ctx, variable_list grad_output)PermutePooledEmbsFunctionSplit< permute_pooled_embs_op >inlinestatic
forward(AutogradContext *ctx, const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)PermutePooledEmbsFunctionSplit< permute_pooled_embs_op >inlinestatic
+ + + + diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html new file mode 100644 index 000000000..daf499383 --- /dev/null +++ b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html @@ -0,0 +1,183 @@ + + + + + + + +fbgemm_gpu: PermutePooledEmbsFunctionSplit< permute_pooled_embs_op > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
PermutePooledEmbsFunctionSplit< permute_pooled_embs_op > Class Template Reference
+
+
+ +

#include <permute_pooled_embs_function_split.h>

+
+Inheritance diagram for PermutePooledEmbsFunctionSplit< permute_pooled_embs_op >:
+
+
+ +
+

Member Function Documentation

+ +

◆ backward()

+ +
+
+
+template<torch::autograd::Variable(*)(const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &) permute_pooled_embs_op>
+ + + + + +
+ + + + + + + + + + + +
static variable_list backward (AutogradContext * ctx,
variable_list grad_output )
+
+inlinestatic
+
+ +
+
+ +

◆ forward()

+ +
+
+
+template<torch::autograd::Variable(*)(const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &, const at::Tensor &) permute_pooled_embs_op>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
static Variable forward (AutogradContext * ctx,
const at::Tensor & pooled_embs,
const at::Tensor & offset_dim_list,
const at::Tensor & permute_list,
const at::Tensor & inv_offset_dim_list,
const at::Tensor & inv_permute_list )
+
+inlinestatic
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.png b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.png new file mode 100644 index 000000000..2ca03427c Binary files /dev/null and b/classfbgemm__gpu_1_1_permute_pooled_embs_function_split.png differ diff --git a/classfbgemm__gpu_1_1_tensor_accessor-members.html b/classfbgemm__gpu_1_1_tensor_accessor-members.html new file mode 100644 index 000000000..4bc6a587d --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor-members.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
TensorAccessor< T, N, PtrTraits, index_t > Member List
+
+
+ +

This is the complete list of members for TensorAccessor< T, N, PtrTraits, index_t >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + +
at(index_t idx) constTensorAccessorBase< T, N, PtrTraits, index_t >inline
data()TensorAccessorBase< T, N, PtrTraits, index_t >inline
data() constTensorAccessorBase< T, N, PtrTraits, index_t >inline
data_TensorAccessorBase< T, N, PtrTraits, index_t >protected
func_name_TensorAccessorBase< T, N, PtrTraits, index_t >protected
numel_TensorAccessorBase< T, N, PtrTraits, index_t >protected
operator[](index_t i)TensorAccessor< T, N, PtrTraits, index_t >inline
operator[](index_t i) constTensorAccessor< T, N, PtrTraits, index_t >inline
ptr_name_TensorAccessorBase< T, N, PtrTraits, index_t >protected
PtrType typedefTensorAccessor< T, N, PtrTraits, index_t >
size(index_t i) constTensorAccessorBase< T, N, PtrTraits, index_t >inline
sizes() constTensorAccessorBase< T, N, PtrTraits, index_t >inline
sizes_TensorAccessorBase< T, N, PtrTraits, index_t >protected
stride(index_t i) constTensorAccessorBase< T, N, PtrTraits, index_t >inline
strides() constTensorAccessorBase< T, N, PtrTraits, index_t >inline
strides_TensorAccessorBase< T, N, PtrTraits, index_t >protected
TensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)TensorAccessor< T, N, PtrTraits, index_t >inline
TensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)TensorAccessorBase< T, N, PtrTraits, index_t >inline
+ + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor.html b/classfbgemm__gpu_1_1_tensor_accessor.html new file mode 100644 index 000000000..cf295edcf --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor.html @@ -0,0 +1,222 @@ + + + + + + + +fbgemm_gpu: TensorAccessor< T, N, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
TensorAccessor< T, N, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for TensorAccessor< T, N, PtrTraits, index_t >:
+
+
+ + +TensorAccessorBase< T, N, PtrTraits, index_t > + +
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorAccessor()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST_DEVICE TensorAccessor (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator[]() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE TensorAccessor< T, N - 1, PtrTraits, index_t > operator[] (index_t i)
+
+inline
+
+ +
+
+ +

◆ operator[]() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE const TensorAccessor< T, N - 1, PtrTraits, index_t > operator[] (index_t i) const
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor.png b/classfbgemm__gpu_1_1_tensor_accessor.png new file mode 100644 index 000000000..c8fd4dd93 Binary files /dev/null and b/classfbgemm__gpu_1_1_tensor_accessor.png differ diff --git a/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html new file mode 100644 index 000000000..27c40fb2d --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4-members.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
TensorAccessor< T, 1, PtrTraits, index_t > Member List
+
+
+ +

This is the complete list of members for TensorAccessor< T, 1, PtrTraits, index_t >, including all inherited members.

+ + + + + + + + + + + + + + + + + + + +
at(index_t idx) constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
data()TensorAccessorBase< T, 1, PtrTraits, index_t >inline
data() constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
data_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
func_name_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
numel_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
operator[](index_t i)TensorAccessor< T, 1, PtrTraits, index_t >inline
operator[](index_t i) constTensorAccessor< T, 1, PtrTraits, index_t >inline
ptr_name_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
PtrType typedefTensorAccessor< T, 1, PtrTraits, index_t >
size(index_t i) constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
sizes() constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
sizes_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
stride(index_t i) constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
strides() constTensorAccessorBase< T, 1, PtrTraits, index_t >inline
strides_TensorAccessorBase< T, 1, PtrTraits, index_t >protected
TensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *func_name)TensorAccessor< T, 1, PtrTraits, index_t >inline
TensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)TensorAccessorBase< T, 1, PtrTraits, index_t >inline
+ + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html new file mode 100644 index 000000000..0d9da0746 --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html @@ -0,0 +1,222 @@ + + + + + + + +fbgemm_gpu: TensorAccessor< T, 1, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
TensorAccessor< T, 1, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for TensorAccessor< T, 1, PtrTraits, index_t >:
+
+
+ + +TensorAccessorBase< T, 1, PtrTraits, index_t > + +
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorAccessor()

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST_DEVICE TensorAccessor (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char * func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ operator[]() [1/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE T & operator[] (index_t i)
+
+inline
+
+ +
+
+ +

◆ operator[]() [2/2]

+ +
+
+
+template<typename T , template< typename U > class PtrTraits, typename index_t >
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE const T & operator[] (index_t i) const
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png new file mode 100644 index 000000000..bbf840bcf Binary files /dev/null and b/classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.png differ diff --git a/classfbgemm__gpu_1_1_tensor_accessor_base-members.html b/classfbgemm__gpu_1_1_tensor_accessor_base-members.html new file mode 100644 index 000000000..2c3f588fb --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor_base-members.html @@ -0,0 +1,105 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
TensorAccessorBase< T, N, PtrTraits, index_t > Member List
+
+ + + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor_base.html b/classfbgemm__gpu_1_1_tensor_accessor_base.html new file mode 100644 index 000000000..2c96d0695 --- /dev/null +++ b/classfbgemm__gpu_1_1_tensor_accessor_base.html @@ -0,0 +1,502 @@ + + + + + + + +fbgemm_gpu: TensorAccessorBase< T, N, PtrTraits, index_t > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
TensorAccessorBase< T, N, PtrTraits, index_t > Class Template Reference
+
+
+ +

#include <fbgemm_tensor_accessor.h>

+
+Inheritance diagram for TensorAccessorBase< T, N, PtrTraits, index_t >:
+
+
+ + +TensorAccessor< T, N, PtrTraits, index_t > + +
+

Member Typedef Documentation

+ +

◆ PtrType

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + +
typedef PtrTraits<T>::PtrType PtrType
+
+ +
+
+

Constructor & Destructor Documentation

+ +

◆ TensorAccessorBase()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
C10_HOST_DEVICE TensorAccessorBase (PtrType data,
const index_t *const sizes,
const index_t *const strides,
const char *const ptr_name,
const char *const func_name )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ at()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE T & at (index_t idx) const
+
+inline
+
+ +
+
+ +

◆ data() [1/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE PtrType data ()
+
+inline
+
+ +
+
+ +

◆ data() [2/2]

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE const PtrType data () const
+
+inline
+
+ +
+
+ +

◆ size()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE index_t size (index_t i) const
+
+inline
+
+ +
+
+ +

◆ sizes()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST at::IntArrayRef sizes () const
+
+inline
+
+ +
+
+ +

◆ stride()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST_DEVICE index_t stride (index_t i) const
+
+inline
+
+ +
+
+ +

◆ strides()

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + + + + +
C10_HOST at::IntArrayRef strides () const
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ data_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
PtrType data_
+
+protected
+
+ +
+
+ +

◆ func_name_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
const char* const func_name_
+
+protected
+
+ +
+
+ +

◆ numel_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
index_t numel_
+
+protected
+
+ +
+
+ +

◆ ptr_name_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
const char* const ptr_name_
+
+protected
+
+ +
+
+ +

◆ sizes_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
const index_t* const sizes_
+
+protected
+
+ +
+
+ +

◆ strides_

+ +
+
+
+template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
+ + + + + +
+ + + + +
const index_t* const strides_
+
+protected
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classfbgemm__gpu_1_1_tensor_accessor_base.png b/classfbgemm__gpu_1_1_tensor_accessor_base.png new file mode 100644 index 000000000..973040bb8 Binary files /dev/null and b/classfbgemm__gpu_1_1_tensor_accessor_base.png differ diff --git a/classfbgemm__gpu_1_1enum__registration-members.html b/classfbgemm__gpu_1_1enum__registration-members.html new file mode 100644 index 000000000..5c684d988 --- /dev/null +++ b/classfbgemm__gpu_1_1enum__registration-members.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
enum_registration< T > Member List
+
+
+ +

This is the complete list of members for enum_registration< T >, including all inherited members.

+ + + + + + + +
enum_query()enum_registration< T >inlinestatic
enum_registration(const char *enum_name, enum_items &&items)enum_registration< T >inline
items_enum_registration< T >protected
name_enum_registration< T >protected
next_enum_registration< T >protected
registration_listenum_registration< T >protectedstatic
+ + + + diff --git a/classfbgemm__gpu_1_1enum__registration.html b/classfbgemm__gpu_1_1enum__registration.html new file mode 100644 index 000000000..356fe18e1 --- /dev/null +++ b/classfbgemm__gpu_1_1enum__registration.html @@ -0,0 +1,251 @@ + + + + + + + +fbgemm_gpu: enum_registration< T > Class Template Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
enum_registration< T > Class Template Reference
+
+
+ +

#include <enum_utils.h>

+

Constructor & Destructor Documentation

+ +

◆ enum_registration()

+ +
+
+
+template<class T >
+ + + + + +
+ + + + + + + + + + + +
enum_registration (const char * enum_name,
enum_items && items )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ enum_query()

+ +
+
+
+template<class T >
+ + + + + +
+ + + + + + + +
static enum_result enum_query ()
+
+inlinestatic
+
+ +
+
+

Member Data Documentation

+ +

◆ items_

+ +
+
+
+template<class T >
+ + + + + +
+ + + + +
std::vector<enum_item> items_
+
+protected
+
+ +
+
+ +

◆ name_

+ +
+
+
+template<class T >
+ + + + + +
+ + + + +
const char* name_
+
+protected
+
+ +
+
+ +

◆ next_

+ +
+
+
+template<class T >
+ + + + + +
+ + + + +
enum_registration<T>* next_
+
+protected
+
+ +
+
+ +

◆ registration_list

+ +
+
+
+template<class T >
+ + + + + +
+ + + + +
enum_registration<T>* registration_list
+
+staticprotected
+
+ +
+
+
The documentation for this class was generated from the following file:
    +
  • /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/enum_utils.h
  • +
+
+ + + + diff --git a/classssd_1_1_embedding_rocks_d_b-members.html b/classssd_1_1_embedding_rocks_d_b-members.html new file mode 100644 index 000000000..02b3795b3 --- /dev/null +++ b/classssd_1_1_embedding_rocks_d_b-members.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
EmbeddingRocksDB Member List
+
+
+ +

This is the complete list of members for EmbeddingRocksDB, including all inherited members.

+ + + + + + + + + + +
compact()EmbeddingRocksDBinline
compact_if_necessary(int64_t timestep)EmbeddingRocksDBinline
EmbeddingRocksDB(std::string path, int64_t num_shards, int64_t num_threads, int64_t memtable_flush_period, int64_t memtable_flush_offset, int64_t l0_files_per_compact, int64_t max_D, int64_t rate_limit_mbps, int64_t size_ratio, int64_t compaction_trigger, int64_t write_buffer_size, int64_t max_write_buffer_num, float uniform_init_lower, float uniform_init_upper, int64_t row_storage_bitwidth=32)EmbeddingRocksDBinline
flush()EmbeddingRocksDBinline
flush_if_necessary(int64_t timestep)EmbeddingRocksDBinline
get(Tensor indices, Tensor weights, Tensor count)EmbeddingRocksDBinline
get_cuda(Tensor indices, Tensor weights, Tensor count)EmbeddingRocksDBinline
set(Tensor indices, Tensor weights, Tensor count)EmbeddingRocksDBinline
set_cuda(Tensor indices, Tensor weights, Tensor count, int64_t timestep)EmbeddingRocksDBinline
+ + + + diff --git a/classssd_1_1_embedding_rocks_d_b.html b/classssd_1_1_embedding_rocks_d_b.html new file mode 100644 index 000000000..6bc271ef2 --- /dev/null +++ b/classssd_1_1_embedding_rocks_d_b.html @@ -0,0 +1,437 @@ + + + + + + + +fbgemm_gpu: EmbeddingRocksDB Class Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
EmbeddingRocksDB Class Reference
+
+
+ +

#include <ssd_table_batched_embeddings.h>

+
+Inheritance diagram for EmbeddingRocksDB:
+
+
+ +
+

Constructor & Destructor Documentation

+ +

◆ EmbeddingRocksDB()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
EmbeddingRocksDB (std::string path,
int64_t num_shards,
int64_t num_threads,
int64_t memtable_flush_period,
int64_t memtable_flush_offset,
int64_t l0_files_per_compact,
int64_t max_D,
int64_t rate_limit_mbps,
int64_t size_ratio,
int64_t compaction_trigger,
int64_t write_buffer_size,
int64_t max_write_buffer_num,
float uniform_init_lower,
float uniform_init_upper,
int64_t row_storage_bitwidth = 32 )
+
+inline
+
+ +
+
+

Member Function Documentation

+ +

◆ compact()

+ +
+
+ + + + + +
+ + + + + + + +
void compact ()
+
+inline
+
+ +
+
+ +

◆ compact_if_necessary()

+ +
+
+ + + + + +
+ + + + + + + +
void compact_if_necessary (int64_t timestep)
+
+inline
+
+ +
+
+ +

◆ flush()

+ +
+
+ + + + + +
+ + + + + + + +
void flush ()
+
+inline
+
+ +
+
+ +

◆ flush_if_necessary()

+ +
+
+ + + + + +
+ + + + + + + +
void flush_if_necessary (int64_t timestep)
+
+inline
+
+ +
+
+ +

◆ get()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + +
void get (Tensor indices,
Tensor weights,
Tensor count )
+
+inline
+
+ +
+
+ +

◆ get_cuda()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + +
void get_cuda (Tensor indices,
Tensor weights,
Tensor count )
+
+inline
+
+ +
+
+ +

◆ set()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + +
void set (Tensor indices,
Tensor weights,
Tensor count )
+
+inline
+
+ +
+
+ +

◆ set_cuda()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + +
void set_cuda (Tensor indices,
Tensor weights,
Tensor count,
int64_t timestep )
+
+inline
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/classssd_1_1_embedding_rocks_d_b.png b/classssd_1_1_embedding_rocks_d_b.png new file mode 100644 index 000000000..b2d676a15 Binary files /dev/null and b/classssd_1_1_embedding_rocks_d_b.png differ diff --git a/classssd_1_1_initializer-members.html b/classssd_1_1_initializer-members.html new file mode 100644 index 000000000..2434d2cd6 --- /dev/null +++ b/classssd_1_1_initializer-members.html @@ -0,0 +1,97 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+
Initializer Member List
+
+
+ +

This is the complete list of members for Initializer, including all inherited members.

+ + + + + + + + +
consumer_queue_Initializer
Initializer(uint64_t random_seed, int64_t max_D, float uniform_init_lower, float uniform_init_upper, int64_t row_storage_bitwidth=32)Initializerinline
producer_Initializer
producer_queue_Initializer
row_storage_Initializer
stop_Initializer
~Initializer()Initializerinline
+ + + + diff --git a/classssd_1_1_initializer.html b/classssd_1_1_initializer.html new file mode 100644 index 000000000..15af1c82f --- /dev/null +++ b/classssd_1_1_initializer.html @@ -0,0 +1,235 @@ + + + + + + + +fbgemm_gpu: Initializer Class Reference + + + + + + + + + + + +
+
+ + + + + + +
+
fbgemm_gpu +
+
+
+ + + + + + + + +
+
+ + +
+
+
+
+
+
Loading...
+
Searching...
+
No Matches
+
+
+
+
+ + +
+
+ +
Initializer Class Reference
+
+
+ +

#include <ssd_table_batched_embeddings.h>

+

Constructor & Destructor Documentation

+ +

◆ Initializer()

+ +
+
+ + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Initializer (uint64_t random_seed,
int64_t max_D,
float uniform_init_lower,
float uniform_init_upper,
int64_t row_storage_bitwidth = 32 )
+
+inline
+
+ +
+
+ +

◆ ~Initializer()

+ +
+
+ + + + + +
+ + + + + + + +
~Initializer ()
+
+inline
+
+ +
+
+

Member Data Documentation

+ +

◆ consumer_queue_

+ +
+
+ + + + +
folly::USPSCQueue<int64_t, true> consumer_queue_
+
+ +
+
+ +

◆ producer_

+ +
+
+ + + + +
std::unique_ptr<std::thread> producer_
+
+ +
+
+ +

◆ producer_queue_

+ +
+
+ + + + +
folly::USPSCQueue<int64_t, true> producer_queue_
+
+ +
+
+ +

◆ row_storage_

+ +
+
+ + + + +
Tensor row_storage_
+
+ +
+
+ +

◆ stop_

+ +
+
+ + + + +
std::atomic<bool> stop_ {false}
+
+ +
+
+
The documentation for this class was generated from the following file: +
+ + + + diff --git a/clipboard.js b/clipboard.js new file mode 100644 index 000000000..42c1fb0e0 --- /dev/null +++ b/clipboard.js @@ -0,0 +1,61 @@ +/** + +The code below is based on the Doxygen Awesome project, see +https://github.com/jothepro/doxygen-awesome-css + +MIT License + +Copyright (c) 2021 - 2022 jothepro + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +*/ + +let clipboard_title = "Copy to clipboard" +let clipboard_icon = `` +let clipboard_successIcon = `` +let clipboard_successDuration = 1000 + +$(function() { + if(navigator.clipboard) { + const fragments = document.getElementsByClassName("fragment") + for(const fragment of fragments) { + const clipboard_div = document.createElement("div") + clipboard_div.classList.add("clipboard") + clipboard_div.innerHTML = clipboard_icon + clipboard_div.title = clipboard_title + $(clipboard_div).click(function() { + const content = this.parentNode.cloneNode(true) + // filter out line number and folded fragments from file listings + content.querySelectorAll(".lineno, .ttc, .foldclosed").forEach((node) => { node.remove() }) + let text = content.textContent + // remove trailing newlines and trailing spaces from empty lines + text = text.replace(/^\s*\n/gm,'\n').replace(/\n*$/,'') + navigator.clipboard.writeText(text); + this.classList.add("success") + this.innerHTML = clipboard_successIcon + window.setTimeout(() => { // switch back to normal icon after timeout + this.classList.remove("success") + this.innerHTML = clipboard_icon + }, clipboard_successDuration); + }) + fragment.insertBefore(clipboard_div, fragment.firstChild) + } + } +}) diff --git a/cookie.js b/cookie.js new file mode 100644 index 000000000..53ad21d98 --- /dev/null +++ b/cookie.js @@ -0,0 +1,58 @@ +/*! + Cookie helper functions + Copyright (c) 2023 Dimitri van Heesch + Released under MIT license. 
+*/ +let Cookie = { + cookie_namespace: 'doxygen_', + + readSetting(cookie,defVal) { + if (window.chrome) { + const val = localStorage.getItem(this.cookie_namespace+cookie) || + sessionStorage.getItem(this.cookie_namespace+cookie); + if (val) return val; + } else { + let myCookie = this.cookie_namespace+cookie+"="; + if (document.cookie) { + const index = document.cookie.indexOf(myCookie); + if (index != -1) { + const valStart = index + myCookie.length; + let valEnd = document.cookie.indexOf(";", valStart); + if (valEnd == -1) { + valEnd = document.cookie.length; + } + return document.cookie.substring(valStart, valEnd); + } + } + } + return defVal; + }, + + writeSetting(cookie,val,days=10*365) { // default days='forever', 0=session cookie, -1=delete + if (window.chrome) { + if (days==0) { + sessionStorage.setItem(this.cookie_namespace+cookie,val); + } else { + localStorage.setItem(this.cookie_namespace+cookie,val); + } + } else { + let date = new Date(); + date.setTime(date.getTime()+(days*24*60*60*1000)); + const expiration = days!=0 ? "expires="+date.toGMTString()+";" : ""; + document.cookie = this.cookie_namespace + cookie + "=" + + val + "; SameSite=Lax;" + expiration + "path=/"; + } + }, + + eraseSetting(cookie) { + if (window.chrome) { + if (localStorage.getItem(this.cookie_namespace+cookie)) { + localStorage.removeItem(this.cookie_namespace+cookie); + } else if (sessionStorage.getItem(this.cookie_namespace+cookie)) { + sessionStorage.removeItem(this.cookie_namespace+cookie); + } + } else { + this.writeSetting(cookie,'',-1); + } + }, +} diff --git a/cpp-api/embedding_ops.html b/cpp-api/embedding_ops.html index 7d71396ca..026c289ae 100644 --- a/cpp-api/embedding_ops.html +++ b/cpp-api/embedding_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -251,18 +253,19 @@ -

FBGEMM_GPU General Info

+

FBGEMM_GPU General Info

-

FBGEMM_GPU Python API

+

FBGEMM_GPU Python API

-

FBGEMM_GPU C++ API

+

FBGEMM_GPU C++ API

  • Sparse Data Operators
  • Quantization Operators
  • @@ -350,61 +353,141 @@
    -

    Embedding Operators

    +

    Embedding Operators

    -

    CUDA Operators

    +

    CUDA Operators

    +
    +
    +Tensor split_embedding_codegen_lookup_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_adam_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t iter = 0, int64_t counter_halflife = -1, int64_t adjustment_iter = -1, double adjustment_ub = 1.0, int64_t learning_rate_mode = -1, int64_t weight_decay_mode = 1, int64_t grad_sum_decay = -1, double max_counter = 0, double tail_id_threshold = 0.0, int64_t is_tail_id_thresh_ratio = 0, int64_t regularization_mode = 0, double weight_norm_coefficient = 0.0, double lower_bound = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool 
use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_approx_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_lamb_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_lars_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate = 0, double eta = 0, double momentum = 0, double weight_decay = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_none_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, int64_t total_hash_size = 0, int64_t total_unique_indices = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_partial_rowwise_adam_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_partial_rowwise_lamb_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate = 0, double eps = 0, double beta1 = 0, double beta2 = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_rowwise_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, double max_norm = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t iter = 0, int64_t counter_halflife = -1, int64_t adjustment_iter = -1, double adjustment_ub = 1.0, int64_t learning_rate_mode = -1, int64_t weight_decay_mode = 1, int64_t grad_sum_decay = -1, double max_counter = 0, double tail_id_threshold = 0.0, int64_t is_tail_id_thresh_ratio = 0, int64_t regularization_mode = 0, double weight_norm_coefficient = 0.0, double lower_bound = 0.0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool 
use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0.0, int64_t weight_decay_mode = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_rowwise_weighted_adagrad_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps = 0, double learning_rate = 0, double weight_decay = 0, int64_t iter = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    + +
    +
    +Tensor split_embedding_codegen_lookup_sgd_function(const Tensor &placeholder_autograd_tensor, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const c10::optional<Tensor> &indice_weights, const c10::optional<Tensor> &feature_requires_grad, const Tensor &lxu_cache_locations, const bool gradient_clipping, const double max_gradient, const bool stochastic_rounding, double learning_rate = 0, const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32), const c10::optional<Tensor> &B_offsets = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_output_offsets_feature_rank = c10::optional<Tensor>(), const c10::optional<Tensor> &vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(), const int64_t max_B = -1, const int64_t max_B_feature_rank = -1, const int64_t vbe_output_size = -1, const bool is_experimental = false, const bool use_uniq_cache_locations_bwd = false, const bool use_homogeneous_placements = false)
    +
    +
    -
    -void bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode, Tensor &warning, const c10::optional<Tensor> &weights, const c10::optional<Tensor> &B_ofsets, const int64_t max_B)
    +
    +void bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode, Tensor &warning, const c10::optional<Tensor> &weights, const c10::optional<Tensor> &B_ofsets, const int64_t max_B)
    -
    -Tensor int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
    +
    +Tensor int_nbit_split_embedding_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
    -
    -Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)
    +
    +Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)

    Simlar to int_nbit_split_embedding_codegen_lookup_function, but it does UVM_CACHING lookup.

    -
    -Tensor pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    +
    +Tensor pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    -
    -Tensor pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
    +
    +Tensor pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
    -

    CPU Operators

    +

    CPU Operators

    -
    -Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
    +
    +Tensor int_nbit_split_embedding_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias)
    -
    -Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)
    +
    +Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional<Tensor> indice_weights, int64_t output_dtype, c10::optional<Tensor> lxu_cache_weights, c10::optional<Tensor> lxu_cache_locations, c10::optional<int64_t> row_alignment, c10::optional<int64_t> max_float8_D, c10::optional<int64_t> fp8_exponent_bits, c10::optional<int64_t> fp8_exponent_bias, c10::optional<Tensor> cache_hash_size_cumsum, c10::optional<int64_t> total_cache_hash_size, c10::optional<Tensor> cache_index_table_map, c10::optional<Tensor> lxu_cache_state, c10::optional<Tensor> lxu_state)
    -
    -void pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    +
    +void pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    -
    -Tensor pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    +
    +Tensor pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets)
    -
    -Tensor pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
    +
    +Tensor pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets)
    @@ -471,11 +554,9 @@

    CPU Operators - - + - - + diff --git a/cpp-api/input_combine.html b/cpp-api/input_combine.html index b222cb7d6..538b028cd 100644 --- a/cpp-api/input_combine.html +++ b/cpp-api/input_combine.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

    FBGEMM_GPU General Info

    +

    FBGEMM_GPU General Info

    -

    FBGEMM_GPU Python API

    +

    FBGEMM_GPU Python API

    -

    FBGEMM_GPU C++ API

    +

    FBGEMM_GPU C++ API

    • Sparse Data Operators
    • Quantization Operators
    • @@ -351,15 +354,15 @@
      -

      Combine Input Operators

      +

      Combine Input Operators

      -
      -std::tuple<at::Tensor, at::Tensor, at::Tensor> tbe_input_combine_cpu(const std::vector<at::Tensor> &indices_list, const std::vector<at::Tensor> &offsets_list, const std::vector<at::Tensor> &per_sample_weights, const at::Tensor &include_last_offsets)
      +
      +std::tuple<at::Tensor, at::Tensor, at::Tensor> tbe_input_combine_cpu(const std::vector<at::Tensor> &indices_list, const std::vector<at::Tensor> &offsets_list, const std::vector<at::Tensor> &per_sample_weights, const at::Tensor &include_last_offsets)
      -
      -std::tuple<at::Tensor, at::Tensor, at::Tensor> padding_fused_tbe_input_combine_cpu(const std::vector<at::Tensor> &indices_list, const std::vector<at::Tensor> &offsets_list, const std::vector<at::Tensor> &per_sample_weights, const at::Tensor &include_last_offsets, int64_t batch_size)
      +
      +std::tuple<at::Tensor, at::Tensor, at::Tensor> padding_fused_tbe_input_combine_cpu(const std::vector<at::Tensor> &indices_list, const std::vector<at::Tensor> &offsets_list, const std::vector<at::Tensor> &per_sample_weights, const at::Tensor &include_last_offsets, int64_t batch_size)
      @@ -423,11 +426,9 @@

      Combine Input Operators - - + - - + diff --git a/cpp-api/jagged_tensor_ops.html b/cpp-api/jagged_tensor_ops.html index 52262b7d5..0f7279ea5 100644 --- a/cpp-api/jagged_tensor_ops.html +++ b/cpp-api/jagged_tensor_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

      FBGEMM_GPU General Info

      +

      FBGEMM_GPU General Info

      -

      FBGEMM_GPU Python API

      +

      FBGEMM_GPU Python API

      -

      FBGEMM_GPU C++ API

      +

      FBGEMM_GPU C++ API

      • Sparse Data Operators
      • Quantization Operators
      • @@ -351,67 +354,67 @@
        -

        Jagged Tensor Operators

        +

        Jagged Tensor Operators

        Jagged Tensor solves the issue when rows in dimension are of different length. This often occurs in sparse feature inputs in recommender systems, as well as natural language processing system batched inputs.

        -

        CUDA Operators

        +

        CUDA Operators

        -
        -at::Tensor jagged_to_padded_dense_forward(const Tensor &values, const std::vector<Tensor> &offsets, c10::SymIntArrayRef max_lengths, const double padding_value)
        +
        +at::Tensor jagged_to_padded_dense_forward(const Tensor &values, const std::vector<Tensor> &offsets, c10::SymIntArrayRef max_lengths, const double padding_value)
        -
        -std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_add_jagged_output_cuda(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)
        +
        +std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_add_jagged_output_cuda(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)

        output = x + y where x is jagged, y is dense, and output is jagged

        -

        CPU Operators

        +

        CPU Operators

        -
        -Tensor jagged_to_padded_dense(const Tensor &values, const std::vector<Tensor> &offsets, const c10::SymIntArrayRef max_lengths, const double padding_value)
        +
        +Tensor jagged_to_padded_dense(const Tensor &values, const std::vector<Tensor> &offsets, const c10::SymIntArrayRef max_lengths, const double padding_value)
        -
        -Tensor jagged_dense_elementwise_add(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)
        +
        +Tensor jagged_dense_elementwise_add(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)

        Output = x + y where x is jagged, y and output are dense

        -
        -std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_mul(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)
        +
        +std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_mul(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)
        -
        -Tensor batched_dense_vec_jagged_2d_mul(const Tensor &v, const Tensor &a_values, const Tensor &a_offsets)
        +
        +Tensor batched_dense_vec_jagged_2d_mul(const Tensor &v, const Tensor &a_values, const Tensor &a_offsets)
        -
        -std::tuple<Tensor, std::vector<Tensor>> dense_to_jagged(const Tensor &dense, const std::vector<Tensor> &offsets, c10::optional<at::SymInt> total_L)
        +
        +std::tuple<Tensor, std::vector<Tensor>> dense_to_jagged(const Tensor &dense, const std::vector<Tensor> &offsets, c10::optional<at::SymInt> total_L)
        -
        -std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_add_jagged_output(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)
        +
        +std::tuple<Tensor, std::vector<Tensor>> jagged_dense_elementwise_add_jagged_output(const Tensor &x_values, const std::vector<Tensor> &x_offsets, const Tensor &y)

        Output = x + y where x is jagged, y is dense, and output is jagged

        -
        -Tensor jagged_1d_to_dense(Tensor values, Tensor offsets, c10::SymInt max_L, int64_t padding_value)
        +
        +Tensor jagged_1d_to_dense(Tensor values, Tensor offsets, c10::SymInt max_L, int64_t padding_value)
        -
        -Tensor jagged_2d_to_dense(Tensor values, Tensor offsets, c10::SymInt max_sequence_length)
        +
        +Tensor jagged_2d_to_dense(Tensor values, Tensor offsets, c10::SymInt max_sequence_length)
        @@ -480,11 +483,9 @@

        CPU Operators - - + - - + diff --git a/cpp-api/layout_transform_ops.html b/cpp-api/layout_transform_ops.html index cef98ca76..4014e6d46 100644 --- a/cpp-api/layout_transform_ops.html +++ b/cpp-api/layout_transform_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

        FBGEMM_GPU General Info

        +

        FBGEMM_GPU General Info

        -

        FBGEMM_GPU Python API

        +

        FBGEMM_GPU Python API

        -

        FBGEMM_GPU C++ API

        +

        FBGEMM_GPU C++ API

        • Sparse Data Operators
        • Quantization Operators
        • @@ -351,30 +354,30 @@
          -

          Layout Transformation Operators

          +

          Layout Transformation Operators

          -

          CUDA Operators

          +

          CUDA Operators

          -
          -Tensor recat_embedding_grad_output_cuda(Tensor grad_output, const std::vector<int64_t> &num_features_per_rank)
          +
          +Tensor recat_embedding_grad_output_cuda(Tensor grad_output, const std::vector<int64_t> &num_features_per_rank)
          -
          -Tensor recat_embedding_grad_output_mixed_D_cuda(const Tensor &grad_output, const std::vector<int64_t> &dim_sum_per_rank)
          +
          +Tensor recat_embedding_grad_output_mixed_D_cuda(const Tensor &grad_output, const std::vector<int64_t> &dim_sum_per_rank)
          -
          -Tensor recat_embedding_grad_output_mixed_D_batch_cuda(const Tensor &grad_output, const Tensor &dim_sum_per_rank, const Tensor &cumsum_dim_sum_per_rank)
          +
          +Tensor recat_embedding_grad_output_mixed_D_batch_cuda(const Tensor &grad_output, const Tensor &dim_sum_per_rank, const Tensor &cumsum_dim_sum_per_rank)
          -

          CPU Operators

          +

          CPU Operators

          -
          -Tensor recat_embedding_grad_output_mixed_D_cpu(const Tensor &grad_output, const std::vector<int64_t> &dim_sum_per_rank)
          +
          +Tensor recat_embedding_grad_output_mixed_D_cpu(const Tensor &grad_output, const std::vector<int64_t> &dim_sum_per_rank)
          @@ -443,11 +446,9 @@

          CPU Operators - - + - - + diff --git a/cpp-api/memory_utils.html b/cpp-api/memory_utils.html index 9144f91fb..bd250f75f 100644 --- a/cpp-api/memory_utils.html +++ b/cpp-api/memory_utils.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

          FBGEMM_GPU General Info

          +

          FBGEMM_GPU General Info

          -

          FBGEMM_GPU Python API

          +

          FBGEMM_GPU Python API

          -

          FBGEMM_GPU C++ API

          +

          FBGEMM_GPU C++ API

          • Sparse Data Operators
          • Quantization Operators
          • @@ -351,11 +354,220 @@
            -

            CUDA Memory Operators

            -
            -

            Warning

            -

            doxygengroup: Cannot find group “memory-utils” in doxygen xml output for project “fbgemm_gpu” from directory: ../build/xml/

            +

            CUDA Memory Operators

            +
            +
            +Tensor new_managed_tensor(const Tensor &self, const std::vector<std::int64_t> &sizes)
            +

            Allocate an at::Tensor with unified managed memory (UVM). Then set its preferred storage location to CPU (host memory) and establish mappings on the CUDA device to the host memory.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • sizes – The target tensor dimensions

            • +
            +
            +
            Returns:
            +

            A new tensor backed by UVM

            +
            +
            +
            + +
            +
            +Tensor new_managed_tensor_meta(const Tensor &self, const std::vector<std::int64_t> &sizes)
            +

            Placeholder operator for the Meta dispatch key.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • sizes – The target tensor dimensions

            • +
            +
            +
            Returns:
            +

            A new empty tensor

            +
            +
            +
            + +
            +
            +Tensor new_host_mapped_tensor(const Tensor &self, const std::vector<std::int64_t> &sizes)
            +

            Allocate the at::Tensor with host-mapped memory.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • sizes – The target tensor dimensions

            • +
            +
            +
            Returns:
            +

            A new tensor backed by host-mapped memory

            +
            +
            +
            + +
            +
            +Tensor new_unified_tensor(const Tensor &self, const std::vector<std::int64_t> &sizes, bool is_host_mapped)
            +

            Allocate the at::Tensor with either unified managed memory (UVM) or host-mapped memory.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • sizes – The target tensor dimensions

            • +
            • is_host_mapped – Whether to allocate UVM or host-mapped memory

            • +
            +
            +
            Returns:
            +

            A new tensor backed by UVM or host-mapped memory, depending on the value of is_host_mapped

            +
            +
            +
            + +
            +
            +Tensor new_vanilla_managed_tensor(const Tensor &self, const std::vector<std::int64_t> &sizes)
            +

            Allocate an at::Tensor with unified managed memory (UVM), but allow for its preferred storage location to be automatically managed.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • sizes – The target tensor dimensions

            • +
            +
            +
            Returns:
            +

            A new tensor backed by UVM

            +
            +
            +
            + +
            +
            +bool uvm_storage(const Tensor &self)
            +

            Check if a tensor is allocated with UVM (either CPU or GPU tensor).

            +
            +
            Parameters:
            +

            self – The input tensor

            +
            +
            Returns:
            +

            true if the tensor is allocated with UVM, otherwise false

            +
            +
            +
            + +
            +
            +bool is_uvm_tensor(const Tensor &self)
            +

            Check if a tensor is allocated with UVM, BUT is not a CPU tensor.

            +
            +
            Parameters:
            +

            self – The input tensor

            +
            +
            Returns:
            +

            true if the tensor is a non-CPU tensor allocated with UVM, otherwise false

            +
            +
            +
            + +
            +
            +Tensor uvm_to_cpu(const Tensor &self)
            +

            Convert a UVM tensor to a CPU tensor.

            +
            +
            Parameters:
            +

            self – The input tensor

            +
            +
            Returns:
            +

            A new tensor that is effectively the input moved from UVM to CPU

            +
            +
            +
            + +
            +
            +Tensor uvm_to_device(const Tensor &self, const Tensor &prototype)
            +

            Create a new UVM tensor that shares the same device and UVM storage with prototype.

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • prototype – The target tensor whose device and and UVM storage will be shared with the new tensor

            • +
            +
            +
            Returns:
            +

            A new tensor that shares the same device and UVM storage with prototype.

            +
            +
            +
            + +
            +
            +void uvm_cuda_mem_advise(const Tensor &self, int64_t cuda_memory_advise)
            +

            Call cudaMemAdvise() on a UVM tensor’s storage. The cudaMemoryAdvise enum is available on the Python side in the fbgemm_gpu.uvm namespace; see the documentation over there for valid values.

            +

            +

            See also

            +

            See here For more information on the cudaMemoryAdvise enum.

            +
            +

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • cuda_memory_advise – The cudaMemoryAdvise enum value, as integer

            • +
            +
            +
            +
            + +
            +
            +void uvm_cuda_mem_prefetch_async(const Tensor &self, c10::optional<Tensor> device_t)
            +

            Call cudaMemPrefetchAsync() on a UVM tensor’s storage to prefetch memory to a destination device.

            +

            +

            See also

            +

            See here For more information on cudaMemPrefetchAsync().

            +

            +
            +
            Parameters:
            +
              +
            • self – The input tensor

            • +
            • device_t[OPTIONAL] The tensor whose device will be the prefetch destination

            • +
            +
            +
            +
            + +
            +
            +void uvm_mem_advice_dont_fork(const Tensor &self)
            +

            Call madvise(...MADV_DONTFORK) on a UVM tensor’s storage. This is a workaround for an issue where the UVM kernel driver un-maps UVM storage pages from the page table on fork, causing slowdown on the next access from a CPU.

            +

            +

            See also

            +

            See here For more information on madvise().

            +
            +

            +
            +
            Parameters:
            +

            self – The input tensor

            +
            +
            +
            + +
            +
            +Tensor uvm_to_cpu_clone(const Tensor &self)
            +

            Copy a UVM tensor’s contiguous storage (uvm_storage(t) is true) into a new CPU Tensor. The copy operation uses single-threaded memcpy().

            +
            +
            Parameters:
            +

            self – The input tensor

            +
            +
            Returns:
            +

            A new CPU tensor containing the data copied from the UVM tensor

            +
            +
            +
            +
            @@ -417,11 +629,9 @@

            CUDA Memory Operators - - + - - + diff --git a/cpp-api/merge_pooled_embeddings.html b/cpp-api/merge_pooled_embeddings.html index 98e6e1823..95190305a 100644 --- a/cpp-api/merge_pooled_embeddings.html +++ b/cpp-api/merge_pooled_embeddings.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

            FBGEMM_GPU General Info

            +

            FBGEMM_GPU General Info

            -

            FBGEMM_GPU Python API

            +

            FBGEMM_GPU Python API

            -

            FBGEMM_GPU C++ API

            +

            FBGEMM_GPU C++ API

            • Sparse Data Operators
            • Quantization Operators
            • @@ -351,58 +354,58 @@
              -

              Pooled Embeddings Operators

              +

              Pooled Embeddings Operators

              This section includes CUDA and CPU operators for various operations with pooled embeddings, including merge and permutation operators.

              -

              Merge Operators

              +

              Merge Operators

              -
              -std::vector<at::Tensor> all_to_one_device(std::vector<at::Tensor> inputTensors, at::Device target_device)
              +
              +std::vector<at::Tensor> all_to_one_device(std::vector<at::Tensor> inputTensors, at::Device target_device)
              -

              Permutation Operators

              +

              Permutation Operators

              -
              -at::Tensor permute_pooled_embs_split_gpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_split_gpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              -
              -at::Tensor permute_pooled_embs_auto_grad_split_gpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_auto_grad_split_gpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              -
              -Tensor permute_pooled_embs_auto_grad_gpu(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              +
              +Tensor permute_pooled_embs_auto_grad_gpu(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              -
              -at::Tensor permute_pooled_embs_split_cpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_cpu_impl(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list, const bool &allow_duplicates)
              -
              -at::Tensor permute_pooled_embs_auto_grad_split_cpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_split_cpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              -
              -at::Tensor permute_pooled_embs_cpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_auto_grad_split_cpu(const at::Tensor &pooled_embs, const at::Tensor &offset_dim_list, const at::Tensor &permute_list, const at::Tensor &inv_offset_dim_list, const at::Tensor &inv_permute_list)
              -
              -at::Tensor permute_pooled_embs_auto_grad(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_auto_grad(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              -
              -at::Tensor permute_pooled_embs_auto_grad_cpu(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              +
              +at::Tensor permute_pooled_embs_auto_grad_cpu(const Tensor &pooled_embs, const Tensor &offset_dim_list, const Tensor &permute_list, const Tensor &inv_offset_dim_list, const Tensor &inv_permute_list)
              @@ -471,11 +474,9 @@

              Permutation Operators - - + - - + diff --git a/cpp-api/quantize_ops.html b/cpp-api/quantize_ops.html index 41c2823a6..e7b0739ac 100644 --- a/cpp-api/quantize_ops.html +++ b/cpp-api/quantize_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

              FBGEMM_GPU General Info

              +

              FBGEMM_GPU General Info

              -

              FBGEMM_GPU Python API

              +

              FBGEMM_GPU Python API

              -

              FBGEMM_GPU C++ API

              +

              FBGEMM_GPU C++ API

              • Sparse Data Operators
              • Quantization Operators
              • @@ -351,183 +354,184 @@
                -

                Quantization Operators

                +

                Quantization Operators

                Quantization is a model optimization technique to reduce the size of a large -model in order to achieve better storage performance with a small loss in accuracy.

                +model in order to achieve better storage performance with a small loss in +accuracy.

                -

                CUDA Operators

                +

                CUDA Operators

                -
                -DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
                +
                +DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)

                Converts a tensor of float values into a tensor of Brain Floating Point (bfloat16) values.

                -
                -DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
                +
                +DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)

                Converts a tensor of Brain Floating Point (bfloat16) values into a tensor of float values.

                -
                -DLL_PUBLIC Tensor _float_to_FP8rowwise_gpu (const Tensor &input, const bool forward)
                +
                +DLL_PUBLIC Tensor _float_to_FP8rowwise_gpu (const Tensor &input, const bool forward)
                -
                -DLL_PUBLIC Tensor _float_to_fused8bitrowwise_gpu (const Tensor &input)
                +
                +DLL_PUBLIC Tensor _float_to_fused8bitrowwise_gpu (const Tensor &input)
                -
                -DLL_PUBLIC Tensor _single_or_half_precision_to_fused8bitrowwise_gpu (const Tensor &input)
                +
                +DLL_PUBLIC Tensor _single_or_half_precision_to_fused8bitrowwise_gpu (const Tensor &input)
                -
                -DLL_PUBLIC at::Tensor _fused8bitrowwise_to_single_or_half_precision_gpu (const at::Tensor &input, const int64_t output_dtype)
                +
                +DLL_PUBLIC at::Tensor _fused8bitrowwise_to_single_or_half_precision_gpu (const at::Tensor &input, const int64_t output_dtype)
                -
                -DLL_PUBLIC at::Tensor _fused8bitrowwise_to_float_mixed_dim_gpu (const at::Tensor &input, const at::Tensor &D_offsets, const int64_t output_dtype)
                +
                +DLL_PUBLIC at::Tensor _fused8bitrowwise_to_float_mixed_dim_gpu (const at::Tensor &input, const at::Tensor &D_offsets, const int64_t output_dtype)
                -
                -template<typename input_t>
                Tensor _float_to_fusednbitrowwise_gpu_t(const Tensor &input, const int64_t bit_rate)
                +
                +template<typename input_t>
                Tensor _float_to_fusednbitrowwise_gpu_t(const Tensor &input, const int64_t bit_rate)
                -
                -DLL_PUBLIC Tensor _float_to_fusednbitrowwise_gpu (const Tensor &input, const int64_t bit_rate)
                +
                +DLL_PUBLIC Tensor _float_to_fusednbitrowwise_gpu (const Tensor &input, const int64_t bit_rate)
                -
                -DLL_PUBLIC at::Tensor _half_to_fusednbitrowwise_gpu (const at::Tensor &input, const int64_t bit_rate)
                +
                +DLL_PUBLIC at::Tensor _half_to_fusednbitrowwise_gpu (const at::Tensor &input, const int64_t bit_rate)
                -
                -template<typename output_t>
                Tensor _fusednbitrowwise_to_float_gpu_t(const Tensor &input, const int64_t bit_rate)
                +
                +template<typename output_t>
                Tensor _fusednbitrowwise_to_float_gpu_t(const Tensor &input, const int64_t bit_rate)
                -
                -DLL_PUBLIC at::Tensor _fusednbitrowwise_to_half_gpu (const at::Tensor &input, const int64_t bit_rate)
                +
                +DLL_PUBLIC at::Tensor _fusednbitrowwise_to_half_gpu (const at::Tensor &input, const int64_t bit_rate)
                -
                -DLL_PUBLIC at::Tensor _fusednbitrowwise_to_float_or_half_gpu (const at::Tensor &input, const int64_t bit_rate, const int64_t output_dtype)
                +
                +DLL_PUBLIC at::Tensor _fusednbitrowwise_to_float_or_half_gpu (const at::Tensor &input, const int64_t bit_rate, const int64_t output_dtype)
                -
                -DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
                +
                +DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)

                Converts a tensor of float values into a tensor of Hybrid 8-bit Floating Point (hfp8) values.

                -
                -DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
                +
                +DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)

                Converts a tensor of Hybrid 8-bit Floating Point (hfp8) values into a tensor of float values.

                -
                -DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
                +
                +DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)

                Converts a tensor of float values into a tensor of Microsoft Floating Point (msfp) values.

                -
                -DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
                +
                +DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)

                Converts a tensor of Microsoft Floating Point (msfp) values into a tensor of float values.

                -
                -DLL_PUBLIC Tensor _float_to_paddedFP8rowwise_gpu (const Tensor &input, const bool forward, const int64_t row_dim)
                +
                +DLL_PUBLIC Tensor _float_to_paddedFP8rowwise_gpu (const Tensor &input, const bool forward, const int64_t row_dim)
                -

                CPU Operators

                +

                CPU Operators

                -
                -Tensor &_fused8bitrowwise_to_float_cpu_out(Tensor &output, const Tensor &input)
                +
                +Tensor &_fused8bitrowwise_to_float_cpu_out(Tensor &output, const Tensor &input)
                -
                -Tensor &_float_to_fused8bitrowwise_cpu_out(Tensor &output, const Tensor &input)
                +
                +Tensor &_float_to_fused8bitrowwise_cpu_out(Tensor &output, const Tensor &input)
                -
                -Tensor float_to_fused8bitrowwise_cpu(const Tensor &input)
                +
                +Tensor float_to_fused8bitrowwise_cpu(const Tensor &input)
                -
                -Tensor half_to_fused8bitrowwise_cpu(const Tensor &input)
                +
                +Tensor half_to_fused8bitrowwise_cpu(const Tensor &input)
                -
                -Tensor float_or_half_to_fused8bitrowwise_cpu(const Tensor &input)
                +
                +Tensor float_or_half_to_fused8bitrowwise_cpu(const Tensor &input)
                -
                -Tensor fused8bitrowwise_to_float_cpu(const Tensor &input)
                +
                +Tensor fused8bitrowwise_to_float_cpu(const Tensor &input)
                -
                -Tensor fused8bitrowwise_to_half_cpu(const Tensor &input)
                +
                +Tensor fused8bitrowwise_to_half_cpu(const Tensor &input)
                -
                -Tensor fused8bitrowwise_to_float_or_half_cpu(const Tensor &input, const int64_t output_dtype)
                +
                +Tensor fused8bitrowwise_to_float_or_half_cpu(const Tensor &input, const int64_t output_dtype)
                -
                -Tensor float_to_FP8rowwise_cpu(const Tensor &input, bool forward)
                +
                +Tensor float_to_FP8rowwise_cpu(const Tensor &input, bool forward)
                -
                -Tensor FP8rowwise_to_float_cpu(const Tensor &input, bool forward, const int64_t output_dtype)
                +
                +Tensor FP8rowwise_to_float_cpu(const Tensor &input, bool forward, const int64_t output_dtype)
                -
                -Tensor fusednbitrowwise_to_float_cpu(const Tensor &input, const int64_t bit_rate)
                +
                +Tensor fusednbitrowwise_to_float_cpu(const Tensor &input, const int64_t bit_rate)
                -
                -Tensor fusednbitrowwise_to_half_cpu(const Tensor &input, const int64_t bit_rate)
                +
                +Tensor fusednbitrowwise_to_half_cpu(const Tensor &input, const int64_t bit_rate)
                -
                -Tensor fusednbitrowwise_to_float_or_half_cpu(const Tensor &input, const int64_t bit_rate, const int64_t output_dtype)
                +
                +Tensor fusednbitrowwise_to_float_or_half_cpu(const Tensor &input, const int64_t bit_rate, const int64_t output_dtype)
                -
                -void FloatToFP8Quantized_ref(const float *const input, const size_t nrows, const size_t ncols, uint8_t *const output, const int ebits, const int exponent_bias, const double max_pos)
                +
                +void FloatToFP8Quantized_ref(const float *const input, const size_t nrows, const size_t ncols, uint8_t *const output, const int ebits, const int exponent_bias, const double max_pos)
                -
                -void FP8QuantizedToFloat_ref(const uint8_t *const input, const size_t nrows, const size_t ncols, float *const output, const int ebits, const int exponent_bias)
                +
                +void FP8QuantizedToFloat_ref(const uint8_t *const input, const size_t nrows, const size_t ncols, float *const output, const int ebits, const int exponent_bias)
                @@ -596,11 +600,9 @@

                CPU Operators - - + - - + diff --git a/cpp-api/sparse_ops.html b/cpp-api/sparse_ops.html index 3d498eaca..04813a7ce 100644 --- a/cpp-api/sparse_ops.html +++ b/cpp-api/sparse_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

                FBGEMM_GPU General Info

                +

                FBGEMM_GPU General Info

                -

                FBGEMM_GPU Python API

                +

                FBGEMM_GPU Python API

                -

                FBGEMM_GPU C++ API

                +

                FBGEMM_GPU C++ API

                • Sparse Data Operators
                • Quantization Operators
                • @@ -351,87 +354,91 @@
                  -

                  Sparse Data Operators

                  -
                  -

                  Sparse Data CUDA Operators

                  +

                  Sparse Data Operators

                  +
                  +

                  CUDA Operators

                  -
                  -at::Tensor expand_into_jagged_permute_cuda(const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                  +
                  +at::Tensor expand_into_jagged_permute_cuda(const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)

                  expand_into_jagged_permute expand the sparse data permute index from table dimension to batch dimension, for cases where the sparse features has different batch sizes across ranks.

                  -

                  -
                  Return

                  The output follows the following formula:

                  output_permute[table_offset[permute[table]] + batch] <- bag_offset[batch]
                  +
                  +
                  Parameters:
                  +
                    +
                  • permute – the table level permute index.

                  • +
                  • input_offsets – the exclusive offsets of table-level length.

                  • +
                  • output_offsets – the exclusive offsets of table-level permuted length. The op expands the permute from table level to batch level by contiguously mapping each bag of its corresponding tables to the position the batch sits on after feature permute. We will derive offset array of table and batch to compute the output permute.

                  • +
                  +
                  +
                  Returns:
                  +

                  The output follows the following formula:

                  output_permute[table_offset[permute[table]] + batch] <- bag_offset[batch]
                   
                  -

                  -
                  -
                  Parameters
                    -
                  • permute: the table level permute index.

                  • -
                  • input_offsets: the exclusive offsets of table-level length.

                  • -
                  • output_offsets: the exclusive offsets of table-level permuted length. The op expands the permute from table level to batch level by contiguously mapping each bag of its corresponding tables to the position the batch sits on after feature permute. We will derive offset array of table and batch to compute the output permute.

                  • -
                  +

                  -

                  -
                  -DLL_PUBLIC Tensor _float_or_half_to_fusednbitrowwise_gpu (const Tensor &input, const int64_t bit_rate)
                  +
                  +DLL_PUBLIC Tensor _float_or_half_to_fusednbitrowwise_gpu (const Tensor &input, const int64_t bit_rate)
                  -
                  -

                  Sparse Data CPU Operators

                  +
                  +

                  CPU Operators

                  -
                  -std::tuple<at::Tensor, at::Tensor> histogram_binning_calibration_cpu(const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound = 0.0, double upper_bound = 1.0, int64_t bin_ctr_in_use_after = 0, double bin_ctr_weight_value = 1.0)
                  -

                  Divide the prediction range (e.g., [0, 1]) into B bins. In each bin, use two parameters to store the number of positive examples and the number of examples that fall into this bucket. So we basically have a histogram for the model prediction. As a result, for each bin, we have a statistical value for the real CTR (num_pos / num_example). We use this statistical value as the final calibrated prediction if the pre-cali prediction falls into the corresponding bin. In this way, the predictions within each bin should be well-calibrated if we have sufficient examples. That is, we have a fine-grained calibrated model by this calibration module. Theoretically, this calibration layer can fix any uncalibrated model or prediction if we have sufficient bins and examples.

                  -
                  Return

                  [calibrated_prediction, bin_ids]

                  -
                  -
                  Parameters
                    -
                  • logit: is input tensor before applying Sigmoid. Assumes positive weight calibration is used for calibartion target, and

                  • -
                  • positive_weight: is passed as input argument. The number of bins is automatically derived from bin_num_examples, and bin_num_positives, all of which should be the same size.

                  • -
                  • lower/upper_bound: Bounds of the bins.

                  • -
                  • bin_ctr_in_use_after: We will use the calibration_target for the final calibrated prediction if we don’t have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value: 0.

                  • -
                  • bin_ctr_weight_value: Weight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:

                    final_calibrated_prediction = bin_ctr_weight * bin_ctr + (1 -
                    +
                    +std::tuple<at::Tensor, at::Tensor> histogram_binning_calibration_cpu(const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound = 0.0, double upper_bound = 1.0, int64_t bin_ctr_in_use_after = 0, double bin_ctr_weight_value = 1.0)
                    +

                    Divide the prediction range (e.g., [0, 1]) into B bins. In each bin, use two parameters to store the number of positive examples and the number of examples that fall into this bucket. So we basically have a histogram for the model prediction. As a result, for each bin, we have a statistical value for the real CTR (num_pos / num_example). We use this statistical value as the final calibrated prediction if the pre-cali prediction falls into the corresponding bin. In this way, the predictions within each bin should be well-calibrated if we have sufficient examples. That is, we have a fine-grained calibrated model by this calibration module. Theoretically, this calibration layer can fix any uncalibrated model or prediction if we have sufficient bins and examples.

                    +
                    +
                    Parameters:
                    +
                      +
                    • logit – is input tensor before applying Sigmoid. Assumes positive weight calibration is used for calibartion target, and

                    • +
                    • positive_weight – is passed as input argument. The number of bins is automatically derived from bin_num_examples, and bin_num_positives, all of which should be the same size.

                    • +
                    • lower/upper_bound – Bounds of the bins.

                    • +
                    • bin_ctr_in_use_after – We will use the calibration_target for the final calibrated prediction if we don’t have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value: 0.

                    • +
                    • bin_ctr_weight_value – Weight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:

                      final_calibrated_prediction = bin_ctr_weight * bin_ctr + (1 -
                       bin_ctr_weight) * calibration_target
                       
                      Default value: 1.0

                    +
                    Returns:
                    +

                    [calibrated_prediction, bin_ids]

                    +
                    -

                  -
                  -std::tuple<at::Tensor, at::Tensor> generic_histogram_binning_calibration_by_feature_cpu(const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after = 0, double bin_ctr_weight_value = 1.0)
                  +
                  +std::tuple<at::Tensor, at::Tensor> generic_histogram_binning_calibration_by_feature_cpu(const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after = 0, double bin_ctr_weight_value = 1.0)

                  An extension of histogram binning calibration model which divides data into bins based on one specific feature and prediction/ECTR range. In each bin, use two parameters to store the number of positive examples and the number of examples that fall into this bucket. So we basically have a histogram for the model prediction. As a result, for each bin, we have a statistical value for the real CTR (num_pos / num_example). We use this statistical value as the final calibrated prediction if the pre-cali prediction falls into the corresponding bin. In this way, the predictions within each bin should be well-calibrated if we have sufficient examples. That is, we have a fine-grained calibrated model by this calibration module. Theoretically, this calibration layer can fix any uncalibrated model or prediction if we have sufficient bins and examples.

                  -

                  Assumes positive weight calibration is used for calibartion target, and

                  -positive_weight is passed as input argument.

                  Same as above, but accepts generic “bin_boundaries”, which is assumed to be sorted.

                  -
                  -
                  Return

                  [calibrated_prediction, bin_ids]

                  -
                  -
                  Return

                  calibrated_prediction.

                  -
                  -
                  Parameters
                    -
                  • logit: is input tensor before applying Sigmoid.

                  • -
                  -
                  -
                  Parameters
                    -
                  • segment_value/lengths: Values and lengths in KeyJaggedTensor. Assumes value of length is either 0 or 1.

                  • -
                  • num_bins: # of bins is no longer the same as bin_num_examples, and bin_num_positives, all of which should be still the same size.

                  • -
                  • lower/upper_bound: Bounds of the bins.

                  • -
                  • bin_ctr_in_use_after: We will use the calibration_target for the final calibrated prediction if we don’t have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value is 0. @parambin_ctr_weight_value Weight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:

                    final_calibrated_prediction = bin_ctr_weight * bin_ctr + (1 -
                    +

                    +Assumes positive weight calibration is used for calibartion target, and positive_weight

                    is passed as input argument.

                    +

                    Same as above, but accepts generic “bin_boundaries”, which is assumed to be sorted.

                    +

                    +
                    +
                    Parameters:
                    +
                      +
                    • logit – is input tensor before applying Sigmoid.

                    • +
                    • segment_value/lengths – Values and lengths in KeyJaggedTensor. Assumes value of length is either 0 or 1.

                    • +
                    • num_bins – # of bins is no longer the same as bin_num_examples, and bin_num_positives, all of which should be still the same size.

                    • +
                    • lower/upper_bound – Bounds of the bins.

                    • +
                    • bin_ctr_in_use_after – We will use the calibration_target for the final calibrated prediction if we don’t have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value is 0. @parambin_ctr_weight_value Weight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:

                      final_calibrated_prediction = bin_ctr_weight * bin_ctr + (1 -
                       bin_ctr_weight) * calibration_target.
                       
                      Default value: 1.0

                    +
                    Returns:
                    +

                    [calibrated_prediction, bin_ids]

                    +
                    +
                    Returns:
                    +

                    calibrated_prediction.

                    +
                    -

                  @@ -481,8 +488,8 @@

                  Sparse Data CPU Operators @@ -500,11 +507,9 @@

                  Sparse Data CPU Operators - - + - - + diff --git a/cpp-api/split_table_batched_embeddings.html b/cpp-api/split_table_batched_embeddings.html index 602dba29a..3a28063c3 100644 --- a/cpp-api/split_table_batched_embeddings.html +++ b/cpp-api/split_table_batched_embeddings.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

                  FBGEMM_GPU General Info

                  +

                  FBGEMM_GPU General Info

                  -

                  FBGEMM_GPU Python API

                  +

                  FBGEMM_GPU Python API

                  -

                  FBGEMM_GPU C++ API

                  +

                  FBGEMM_GPU C++ API

                  • Sparse Data Operators
                  • Quantization Operators
                  • @@ -351,99 +354,99 @@
                    -

                    Table Batched Embedding Operators

                    +

                    Table Batched Embedding Operators

                    -
                    -std::tuple<at::Tensor, at::Tensor, c10::optional<at::Tensor>> get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count)
                    +
                    +std::tuple<at::Tensor, at::Tensor, c10::optional<at::Tensor>> get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count)

                    Deduplicate indices.

                    -
                    -std::pair<at::Tensor, at::Tensor> lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter)
                    +
                    +std::pair<at::Tensor, at::Tensor> lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter)

                    Lookup LRU cache to find uncached indices, and then sort them based on the set.

                    -
                    -int64_t host_lxu_cache_slot(int64_t h_in, int64_t C)
                    +
                    +int64_t host_lxu_cache_slot(int64_t h_in, int64_t C)

                    Map index to cache_set. h_in: linear_indices; C: #cache_sets.

                    -
                    -at::Tensor linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets)
                    +
                    +at::Tensor linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets)

                    Linearize the indices of all tables to make it be unique

                    -
                    -at::Tensor linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)
                    +
                    +at::Tensor linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)

                    Linearize the indices of all tables to make it be unique. Note the update_table_indices and update_row_indices are from the row indices format for inplace update.

                    -
                    -void lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats, bool lock_cache_line, c10::optional<at::Tensor> lxu_cache_locking_counter)
                    +
                    +void lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats, bool lock_cache_line, c10::optional<at::Tensor> lxu_cache_locking_counter)

                    LRU cache: fetch the rows corresponding to linear_cache_indices from weights, and insert them into the cache at timestep time_stamp.

                    -
                    -void lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)
                    +
                    +void lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)

                    LRU cache: fetch the rows corresponding to linear_cache_indices from weights, and insert them into the cache at timestep time_stamp. weights and lxu_cache_weights have “uint8_t” byte elements

                    -
                    -void direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)
                    +
                    +void direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)

                    Direct-mapped (assoc=1) variant of lru_cache_populate_byte_cuda

                    -
                    -void lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)
                    +
                    +void lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)

                    LFU cache: fetch the rows corresponding to linear_cache_indices from weights, and insert them into the cache.

                    -
                    -void lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)
                    +
                    +void lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)

                    LFU cache: fetch the rows corresponding to linear_cache_indices from weights, and insert them into the cache. weights and lxu_cache_weights have “uint8_t” byte elements

                    -
                    -at::Tensor lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats, c10::optional<at::Tensor> num_uniq_cache_indices, c10::optional<at::Tensor> lxu_cache_locations_output)
                    +
                    +at::Tensor lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats, c10::optional<at::Tensor> num_uniq_cache_indices, c10::optional<at::Tensor> lxu_cache_locations_output)

                    Lookup the LRU/LFU cache: find the cache weights location for all indices. Look up the slots in the cache corresponding to linear_cache_indices, with a sentinel value for missing.

                    -
                    -at::Tensor direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)
                    +
                    +at::Tensor direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional<at::Tensor> uvm_cache_stats)

                    Lookup the LRU/LFU cache: find the cache weights location for all indices. Look up the slots in the cache corresponding to linear_cache_indices, with a sentinel value for missing.

                    -
                    -void lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)
                    +
                    +void lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)

                    Flush the cache: store the weights from the cache to the backing storage.

                    -
                    -void reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size)
                    +
                    +void reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size)
                    -
                    -void lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)
                    +
                    +void lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)

                    Decrement the LRU/LFU cache counter based on lxu_cache_locations.

                    -
                    -void lxu_cache_locations_update_cuda(at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional<at::Tensor> num_uniq_cache_indices)
                    +
                    +void lxu_cache_locations_update_cuda(at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional<at::Tensor> num_uniq_cache_indices)

                    Inplace update lxu_cache_locations to the new one should only update if lxu_cache_locations[i] == -1 and lxu_cache_locations_new[i] >= 0

                    @@ -508,11 +511,9 @@

                    Table Batched Embedding Operators - - + - - + diff --git a/cpu__kernel__test_8cpp.html b/cpu__kernel__test_8cpp.html new file mode 100644 index 000000000..448e548c0 --- /dev/null +++ b/cpu__kernel__test_8cpp.html @@ -0,0 +1,115 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/test/cpu_kernel_test.cpp File Reference + + + + + + + + + + + + +
                    +
                    cpu_kernel_test.cpp File Reference
                    +
                    +
                    +
                    #include <gtest/gtest.h>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include "deeplearning/fbgemm/fbgemm_gpu/codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "torch/types.h"
                    +

                    Function Documentation

                    + +

                    ◆ TEST()

                    + +
                    +
                    + + + + + + + + + + + +
                    TEST (cpu_kernel_test ,
                    csr2csc_test  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/cpu__utils_8h.html b/cpu__utils_8h.html new file mode 100644 index 000000000..bdf1d6e81 --- /dev/null +++ b/cpu__utils_8h.html @@ -0,0 +1,105 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/cpu_utils.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    cpu_utils.h File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <cstdint>
                    +#include <utility>
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    + + + + +

                    +Functions

                    template<typename IndexType >
                    void report_embedding_error (int t, int B, int b_begin, int b_end, const IndexType *offsets_data, const IndexType *indices_data, int64_t hash_size, bool allow_minus_one=false)
                     
                    +
                    + + + + diff --git a/cub__namespace__postfix_8cuh.html b/cub__namespace__postfix_8cuh.html new file mode 100644 index 000000000..e47e1666f --- /dev/null +++ b/cub__namespace__postfix_8cuh.html @@ -0,0 +1,102 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/cub_namespace_postfix.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    cub_namespace_postfix.cuh File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ FBGEMM_GPU_CUB_NS_PREFIX

                    + +
                    +
                    + + + + +
                    #define FBGEMM_GPU_CUB_NS_PREFIX
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/cub__namespace__prefix_8cuh.html b/cub__namespace__prefix_8cuh.html new file mode 100644 index 000000000..a0fe64898 --- /dev/null +++ b/cub__namespace__prefix_8cuh.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/cub_namespace_prefix.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    cub_namespace_prefix.cuh File Reference
                    +
                    +
                    +
                    + + + + diff --git a/cuda__utils_8cuh.html b/cuda__utils_8cuh.html new file mode 100644 index 000000000..e01699c82 --- /dev/null +++ b/cuda__utils_8cuh.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/cuda_utils.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    cuda_utils.cuh File Reference
                    +
                    +
                    +
                    #include <cuda.h>
                    +#include <cassert>
                    +

                    Macro Definition Documentation

                    + +

                    ◆ CUDA_CHECK

                    + +
                    +
                    + + + + + + + +
                    #define CUDA_CHECK( X)
                    +
                    +Value:
                    do { \
                    +
                    cudaError_t err = X; \
                    +
                    assert(err == cudaError::cudaSuccess); \
                    +
                    } while (0)
                    +
                    #define X(DeviceOnly, OutputRowsPerThread, InputRowsInFlight, MinNum128BRows, MaxNum128BRows)
                    +
                    +
                    +
                    +
                    + + + + diff --git a/cumem__utils_8h.html b/cumem__utils_8h.html new file mode 100644 index 000000000..b2becd87e --- /dev/null +++ b/cumem__utils_8h.html @@ -0,0 +1,126 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/cumem_utils.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    cumem_utils.h File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                    +Functions

                    Tensor new_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                     
                    Tensor new_managed_tensor_meta (const Tensor &self, const std::vector< std::int64_t > &sizes)
                     
                    Tensor new_host_mapped_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                     
                    Tensor new_unified_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes, bool is_host_mapped)
                     
                    Tensor new_vanilla_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                     
                    bool uvm_storage (const Tensor &self)
                     
                    bool is_uvm_tensor (const Tensor &self)
                     
                    Tensor uvm_to_cpu (const Tensor &self)
                     
                    Tensor uvm_to_device (const Tensor &self, const Tensor &prototype)
                     
                    void uvm_cuda_mem_advise (const Tensor &self, int64_t cuda_memory_advise)
                     
                    void uvm_cuda_mem_prefetch_async (const Tensor &self, c10::optional< Tensor > device_t)
                     
                    void uvm_mem_advice_dont_fork (const Tensor &self)
                     
                    Tensor uvm_to_cpu_clone (const Tensor &self)
                     
                    +
                    + + + + diff --git a/dense__to__jagged__forward_8cu.html b/dense__to__jagged__forward_8cu.html new file mode 100644 index 000000000..e445b2d50 --- /dev/null +++ b/dense__to__jagged__forward_8cu.html @@ -0,0 +1,168 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/dense_to_jagged_forward.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    dense_to_jagged_forward.cu File Reference
                    +
                    +
                    +
                    #include "common.cuh"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_DENSE_TO_JAGGED_CASE

                    + +
                    +
                    + + + + + + + +
                    #define DISPATCH_DENSE_TO_JAGGED_CASE( TYPE)
                    +
                    +Value:
                    AT_DISPATCH_CASE(TYPE, [&] { \
                    +
                    jagged_dense_elementwise_jagged_output_opt_<scalar_t>( \
                    +
                    values, \
                    + +
                    dense, \
                    +
                    output, \
                    +
                    [] __device__(scalar_t /*unused*/, scalar_t y) -> scalar_t { \
                    +
                    return y; \
                    +
                    }); \
                    +
                    })
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets
                    Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:104
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ FBGEMM_OP_DISPATCH()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    FBGEMM_OP_DISPATCH (CUDA ,
                    "dense_to_jagged_forward" ,
                    fbgemm_gpu::dense_to_jagged_forward  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/dir_0255d041b3ce7964bcd7b11954959c22.html b/dir_0255d041b3ce7964bcd7b11954959c22.html index df00fb44d..3354ae2a9 100644 --- a/dir_0255d041b3ce7964bcd7b11954959c22.html +++ b/dir_0255d041b3ce7964bcd7b11954959c22.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,86 @@
                    codegen Directory Reference
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                    +Files

                     batch_index_select_dim0_cpu_host.cpp
                     
                     batch_index_select_dim0_host.cpp
                     
                     embedding_backward_dense_host.cpp
                     
                     embedding_backward_dense_host_cpu.cpp
                     
                     embedding_backward_split_cpu_approx_template.cpp
                     
                     embedding_backward_split_cpu_template.cpp
                     
                     embedding_backward_split_grad_template.cu
                     
                     embedding_backward_split_host_cpu_template.cpp
                     
                     embedding_backward_split_host_template.cpp
                     
                     embedding_backward_split_indice_weights_template.cu
                     
                     embedding_backward_split_kernel_cta_template.cu
                     
                     embedding_backward_split_kernel_warp_template.cu
                     
                     embedding_backward_split_template.cu
                     
                     embedding_bounds_check.cu
                     
                     embedding_bounds_check_host.cpp
                     
                     embedding_bounds_check_host_cpu.cpp
                     
                     embedding_forward_quantized_cpu_template.cpp
                     
                     embedding_forward_quantized_host.cpp
                     
                     embedding_forward_quantized_host_cpu.cpp
                     
                     embedding_forward_quantized_split_lookup.cu
                     
                     embedding_forward_quantized_split_nbit_host_template.cu
                     
                     embedding_forward_quantized_split_nbit_kernel_template.cu
                     
                     embedding_forward_split_cpu.cpp
                     
                     embedding_forward_split_cpu.h
                     
                     embedding_forward_split_kernel_nobag_small_template.cu
                     
                     embedding_forward_split_kernel_template.cu
                     
                     embedding_forward_split_kernel_v2_template.cu
                     
                     embedding_forward_split_meta_template.cpp
                     
                     embedding_forward_split_template.cu
                     
                     embedding_forward_template_helpers.cuh
                     
                     embedding_op_registration.h
                     
                     embedding_ops_placeholder.cpp
                     
                     embedding_optimizer_split_device_kernel_template.cuh
                     
                     embedding_optimizer_split_host_template.cpp
                     
                     embedding_optimizer_split_kernel_template.cu
                     
                     embedding_optimizer_split_template.cu
                     
                    diff --git a/dir_02a03557abfde8453507651f5e287abe.html b/dir_02a03557abfde8453507651f5e287abe.html index a18bf048c..048b24004 100644 --- a/dir_02a03557abfde8453507651f5e287abe.html +++ b/dir_02a03557abfde8453507651f5e287abe.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/merge_pooled_embedding_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,18 @@
                    merge_pooled_embedding_ops Directory Reference
                    diff --git a/dir_0948881d7cc927e01ea6d36a3aab1e2e.html b/dir_0948881d7cc927e01ea6d36a3aab1e2e.html index f84acbd57..1e43ca919 100644 --- a/dir_0948881d7cc927e01ea6d36a3aab1e2e.html +++ b/dir_0948881d7cc927e01ea6d36a3aab1e2e.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_utils Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,24 @@
                    split_embeddings_utils Directory Reference
                    diff --git a/dir_13e138d54eb8818da29c3992edef070a.html b/dir_13e138d54eb8818da29c3992edef070a.html new file mode 100644 index 000000000..e0d9c02c6 --- /dev/null +++ b/dir_13e138d54eb8818da29c3992edef070a.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/test Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    test Directory Reference
                    +
                    + + + + + diff --git a/dir_276218242e4c9e66d5a5475a5ec0acdc.html b/dir_276218242e4c9e66d5a5475a5ec0acdc.html index 121020ebd..fa631c91e 100644 --- a/dir_276218242e4c9e66d5a5475a5ec0acdc.html +++ b/dir_276218242e4c9e66d5a5475a5ec0acdc.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/metric_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,20 @@
                    metric_ops Directory Reference
                    + + + + + + + + +

                    +Files

                     metric_ops.cu
                     
                     metric_ops.h
                     
                     metric_ops_host.cpp
                     
                    diff --git a/dir_47b3f8f6a06f015d543fc51782f25cbc.html b/dir_47b3f8f6a06f015d543fc51782f25cbc.html index b1ff3c3ec..1e230dda8 100644 --- a/dir_47b3f8f6a06f015d543fc51782f25cbc.html +++ b/dir_47b3f8f6a06f015d543fc51782f25cbc.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/input_combine_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,20 @@
                    input_combine_ops Directory Reference
                    diff --git a/dir_4b83c65efe436c76bd5bbbb817afaf6c.html b/dir_4b83c65efe436c76bd5bbbb817afaf6c.html index d4254ae47..095250bff 100644 --- a/dir_4b83c65efe436c76bd5bbbb817afaf6c.html +++ b/dir_4b83c65efe436c76bd5bbbb817afaf6c.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,28 @@
                    memory_utils Directory Reference
                    diff --git a/dir_4ba5c3fb534fa6dc09bb4e43398a4fa2.html b/dir_4ba5c3fb534fa6dc09bb4e43398a4fa2.html index cf2129abb..33bf0d131 100644 --- a/dir_4ba5c3fb534fa6dc09bb4e43398a4fa2.html +++ b/dir_4ba5c3fb534fa6dc09bb4e43398a4fa2.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/ssd_split_embeddings_cache Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,20 @@
                    ssd_split_embeddings_cache Directory Reference
                    diff --git a/dir_5d9ed08f5e7f3c5fee3a750ceaf7305f.html b/dir_5d9ed08f5e7f3c5fee3a750ceaf7305f.html index 259828557..d30f73c7b 100644 --- a/dir_5d9ed08f5e7f3c5fee3a750ceaf7305f.html +++ b/dir_5d9ed08f5e7f3c5fee3a750ceaf7305f.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,28 @@
                    permute_pooled_embedding_ops Directory Reference
                    diff --git a/dir_5f55f9fa3600c80e31b55cfa7be0ede8.html b/dir_5f55f9fa3600c80e31b55cfa7be0ede8.html new file mode 100644 index 000000000..1e95cadd9 --- /dev/null +++ b/dir_5f55f9fa3600c80e31b55cfa7be0ede8.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles/3.28.1 Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    3.28.1 Directory Reference
                    +
                    +
                    + + + + + + +

                    +Directories

                     CompilerIdC
                     
                     CompilerIdCXX
                     
                    +
                    + + + + diff --git a/dir_68267d1309a1af8e8297ef4c3efbcdba.html b/dir_68267d1309a1af8e8297ef4c3efbcdba.html index 9548a1cfb..a005ef73b 100644 --- a/dir_68267d1309a1af8e8297ef4c3efbcdba.html +++ b/dir_68267d1309a1af8e8297ef4c3efbcdba.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -105,11 +107,18 @@    ssd_split_embeddings_cache   + + + + + +

                    +Files

                     histogram_binning_calibration_ops.cu
                     
                     topology_utils.cpp
                     
                    diff --git a/dir_7171c7990335cc008eec7387f12fe0ea.html b/dir_7171c7990335cc008eec7387f12fe0ea.html index 81413c0a4..26373ad71 100644 --- a/dir_7171c7990335cc008eec7387f12fe0ea.html +++ b/dir_7171c7990335cc008eec7387f12fe0ea.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/embedding_inplace_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,22 @@
                    embedding_inplace_ops Directory Reference
                    diff --git a/dir_7caac3cc36f516c287d0977dc87384a8.html b/dir_7caac3cc36f516c287d0977dc87384a8.html index 11695890a..2caf07a07 100644 --- a/dir_7caac3cc36f516c287d0977dc87384a8.html +++ b/dir_7caac3cc36f516c287d0977dc87384a8.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,36 @@
                    quantize_ops Directory Reference
                    diff --git a/dir_7ce412f9e32e10e58164510708821927.html b/dir_7ce412f9e32e10e58164510708821927.html index f6874a3c8..1db14c209 100644 --- a/dir_7ce412f9e32e10e58164510708821927.html +++ b/dir_7ce412f9e32e10e58164510708821927.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,48 @@
                    split_embeddings_cache Directory Reference
                    diff --git a/dir_828e33ae11ea9ec04ffe6e59c52eef6d.html b/dir_828e33ae11ea9ec04ffe6e59c52eef6d.html index ec7ff4afc..17025861e 100644 --- a/dir_828e33ae11ea9ec04ffe6e59c52eef6d.html +++ b/dir_828e33ae11ea9ec04ffe6e59c52eef6d.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,62 @@
                    sparse_ops Directory Reference
                    diff --git a/dir_897ef76b26d94e0feb8fb6e0621cd742.html b/dir_897ef76b26d94e0feb8fb6e0621cd742.html index c23b4b1dd..84a956220 100644 --- a/dir_897ef76b26d94e0feb8fb6e0621cd742.html +++ b/dir_897ef76b26d94e0feb8fb6e0621cd742.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/layout_transform_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,20 @@
                    layout_transform_ops Directory Reference
                    diff --git a/dir_a27d41c4018669c20f452802c44efb2d.html b/dir_a27d41c4018669c20f452802c44efb2d.html new file mode 100644 index 000000000..098d83b1e --- /dev/null +++ b/dir_a27d41c4018669c20f452802c44efb2d.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12 Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    linux-x86_64-3.12 Directory Reference
                    +
                    +
                    + + + + +

                    +Directories

                     cmake-build
                     
                    +
                    + + + + diff --git a/dir_a36c4719283424f51e58ca3678e5dea3.html b/dir_a36c4719283424f51e58ca3678e5dea3.html new file mode 100644 index 000000000..e90092224 --- /dev/null +++ b/dir_a36c4719283424f51e58ca3678e5dea3.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/bench Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    bench Directory Reference
                    +
                    + + + + + diff --git a/dir_a88d368584008a90df396d91e5b8b095.html b/dir_a88d368584008a90df396d91e5b8b095.html index 6e2f3eb17..4002ae48d 100644 --- a/dir_a88d368584008a90df396d91e5b8b095.html +++ b/dir_a88d368584008a90df396d91e5b8b095.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,60 @@
                    jagged_tensor_ops Directory Reference
                    diff --git a/dir_ae8e6ef04f6eeb9549906760d0097e6e.html b/dir_ae8e6ef04f6eeb9549906760d0097e6e.html new file mode 100644 index 000000000..b1994b73a --- /dev/null +++ b/dir_ae8e6ef04f6eeb9549906760d0097e6e.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    CMakeFiles Directory Reference
                    +
                    +
                    + + + + +

                    +Directories

                     3.28.1
                     
                    +
                    + + + + diff --git a/dir_b4b8bd075f03e0fff4167d5f80e92046.html b/dir_b4b8bd075f03e0fff4167d5f80e92046.html new file mode 100644 index 000000000..819e44827 --- /dev/null +++ b/dir_b4b8bd075f03e0fff4167d5f80e92046.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    _skbuild Directory Reference
                    +
                    +
                    + + + + +

                    +Directories

                     linux-x86_64-3.12
                     
                    +
                    + + + + diff --git a/dir_cafe9c3a34c8f467f9ca81fe4c33c741.html b/dir_cafe9c3a34c8f467f9ca81fe4c33c741.html index 5c99d6ec3..b699faac4 100644 --- a/dir_cafe9c3a34c8f467f9ca81fe4c33c741.html +++ b/dir_cafe9c3a34c8f467f9ca81fe4c33c741.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -76,10 +78,72 @@
                    fbgemm_gpu Directory Reference
                    diff --git a/dir_d42b091ea9351334e82212d21cbafb15.html b/dir_d42b091ea9351334e82212d21cbafb15.html new file mode 100644 index 000000000..9a00f8981 --- /dev/null +++ b/dir_d42b091ea9351334e82212d21cbafb15.html @@ -0,0 +1,600 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    cmake-build Directory Reference
                    +
                    +
                    + + + + +

                    +Directories

                     CMakeFiles
                     
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                    +Files

                     gen_batch_index_select_dim0_backward_codegen_cuda.cu
                     
                     gen_batch_index_select_dim0_backward_kernel_cta.cu
                     
                     gen_batch_index_select_dim0_backward_kernel_warp.cu
                     
                     gen_batch_index_select_dim0_forward_codegen_cuda.cu
                     
                     gen_batch_index_select_dim0_forward_kernel.cu
                     
                     gen_batch_index_select_dim0_forward_kernel_small.cu
                     
                     gen_embedding_backward_adagrad_split_cpu.cpp
                     
                     gen_embedding_backward_adagrad_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_adagrad_split_weighted_cuda.cu
                     
                     gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_adam_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_adam_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_adam_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_adam_split_weighted_cuda.cu
                     
                     gen_embedding_backward_adam_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_adam_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_dense_indice_weights_codegen_cuda.cu
                     
                     gen_embedding_backward_dense_split_cpu.cpp
                     
                     gen_embedding_backward_dense_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_dense_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_dense_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_dense_split_weighted_cuda.cu
                     
                     gen_embedding_backward_dense_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_dense_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_lamb_split_weighted_cuda.cu
                     
                     gen_embedding_backward_lamb_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_lamb_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_lars_sgd_split_weighted_cuda.cu
                     
                     gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_none_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_none_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_none_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_none_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_none_split_weighted_cuda.cu
                     
                     gen_embedding_backward_none_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_none_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_cpu.cpp
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_sgd_split_cpu.cpp
                     
                     gen_embedding_backward_sgd_split_unweighted_cuda.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu
                     
                     gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu
                     
                     gen_embedding_backward_sgd_split_weighted_cuda.cu
                     
                     gen_embedding_backward_sgd_split_weighted_kernel_cta.cu
                     
                     gen_embedding_backward_sgd_split_weighted_kernel_warp.cu
                     
                     gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu
                     
                     gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu
                     
                     gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu
                     
                     gen_embedding_backward_split_adagrad.cpp
                     
                     gen_embedding_backward_split_adagrad_cpu.cpp
                     
                     gen_embedding_backward_split_adam.cpp
                     
                     gen_embedding_backward_split_adam_cpu.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp
                     
                     gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp
                     
                     gen_embedding_backward_split_approx_sgd.cpp
                     
                     gen_embedding_backward_split_approx_sgd_cpu.cpp
                     
                     gen_embedding_backward_split_grad.cu
                     
                     gen_embedding_backward_split_indice_weights_codegen_cuda.cu
                     
                     gen_embedding_backward_split_lamb.cpp
                     
                     gen_embedding_backward_split_lamb_cpu.cpp
                     
                     gen_embedding_backward_split_lars_sgd.cpp
                     
                     gen_embedding_backward_split_lars_sgd_cpu.cpp
                     
                     gen_embedding_backward_split_none.cpp
                     
                     gen_embedding_backward_split_none_cpu.cpp
                     
                     gen_embedding_backward_split_partial_rowwise_adam.cpp
                     
                     gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp
                     
                     gen_embedding_backward_split_partial_rowwise_lamb.cpp
                     
                     gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad_cpu.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp
                     
                     gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp
                     
                     gen_embedding_backward_split_rowwise_weighted_adagrad.cpp
                     
                     gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp
                     
                     gen_embedding_backward_split_sgd.cpp
                     
                     gen_embedding_backward_split_sgd_cpu.cpp
                     
                     gen_embedding_forward_dense_unweighted_codegen_cuda.cu
                     
                     gen_embedding_forward_dense_unweighted_codegen_meta.cpp
                     
                     gen_embedding_forward_dense_unweighted_kernel.cu
                     
                     gen_embedding_forward_dense_unweighted_nobag_kernel.cu
                     
                     gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu
                     
                     gen_embedding_forward_dense_weighted_codegen_cuda.cu
                     
                     gen_embedding_forward_dense_weighted_codegen_meta.cpp
                     
                     gen_embedding_forward_dense_weighted_kernel.cu
                     
                     gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu
                     
                     gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp
                     
                     gen_embedding_forward_quantized_weighted_codegen_cpu.cpp
                     
                     gen_embedding_forward_split_unweighted_codegen_cuda.cu
                     
                     gen_embedding_forward_split_unweighted_codegen_meta.cpp
                     
                     gen_embedding_forward_split_unweighted_kernel.cu
                     
                     gen_embedding_forward_split_unweighted_nobag_kernel.cu
                     
                     gen_embedding_forward_split_unweighted_nobag_kernel_small.cu
                     
                     gen_embedding_forward_split_unweighted_v2_kernel.cu
                     
                     gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu
                     
                     gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp
                     
                     gen_embedding_forward_split_unweighted_vbe_kernel.cu
                     
                     gen_embedding_forward_split_weighted_codegen_cuda.cu
                     
                     gen_embedding_forward_split_weighted_codegen_meta.cpp
                     
                     gen_embedding_forward_split_weighted_kernel.cu
                     
                     gen_embedding_forward_split_weighted_v2_kernel.cu
                     
                     gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu
                     
                     gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp
                     
                     gen_embedding_forward_split_weighted_vbe_kernel.cu
                     
                     gen_embedding_optimizer_adagrad_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_adam_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_dense_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_lamb_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_none_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_rowwise_adagrad_split.cpp
                     
                     gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu
                     
                     gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu
                     
                     gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh
                     
                     gen_embedding_optimizer_sgd_split_device_kernel.cuh
                     
                    +
                    + + + + diff --git a/dir_d44c64559bbebec7f509842c48db8b23.html b/dir_d44c64559bbebec7f509842c48db8b23.html index 66168594b..193507815 100644 --- a/dir_d44c64559bbebec7f509842c48db8b23.html +++ b/dir_d44c64559bbebec7f509842c48db8b23.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include Directory Reference + + @@ -29,7 +31,7 @@ - + @@ -85,7 +87,7 @@ diff --git a/dir_d8fa031c2715d8d52539c7e4d4cc6d73.html b/dir_d8fa031c2715d8d52539c7e4d4cc6d73.html new file mode 100644 index 000000000..519114c5b --- /dev/null +++ b/dir_d8fa031c2715d8d52539c7e4d4cc6d73.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles/3.28.1/CompilerIdCXX Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    CompilerIdCXX Directory Reference
                    +
                    +
                    + + + + +

                    +Files

                     CMakeCXXCompilerId.cpp
                     
                    +
                    + + + + diff --git a/dir_dfdf575eb5c21ea09ad9fb656efb7738.html b/dir_dfdf575eb5c21ea09ad9fb656efb7738.html new file mode 100644 index 000000000..62edfb6b2 --- /dev/null +++ b/dir_dfdf575eb5c21ea09ad9fb656efb7738.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/CMakeFiles/3.28.1/CompilerIdC Directory Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    CompilerIdC Directory Reference
                    +
                    +
                    + + + + +

                    +Files

                     CMakeCCompilerId.c
                     
                    +
                    + + + + diff --git a/dispatch__macros_8h.html b/dispatch__macros_8h.html new file mode 100644 index 000000000..d6ea0d315 --- /dev/null +++ b/dispatch__macros_8h.html @@ -0,0 +1,621 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/dispatch_macros.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    dispatch_macros.h File Reference
                    +
                    +
                    +
                    #include <torch/library.h>
                    +

                    Macro Definition Documentation

                    + +

                    ◆ _DISPATCH_EMB_CACHE_TYPES

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    #define _DISPATCH_EMB_CACHE_TYPES( emb_enum_type,
                    cache_enum_type,
                    NAME,
                    ... )
                    +
                    +Value:
                    at::ScalarType _emb_t = ::detail::scalar_type(emb_enum_type); \
                    +
                    at::ScalarType _cache_t = ::detail::scalar_type(cache_enum_type); \
                    +
                    switch (_emb_t) { \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Byte, _cache_t, uint8_t, NAME, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Float, _cache_t, float, NAME, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Half, _cache_t, at::Half, NAME, __VA_ARGS__) \
                    +
                    default: \
                    +
                    AT_ERROR(#NAME, " not implemented for emb_t '", toString(_emb_t), "'"); \
                    +
                    }
                    +
                    template uint8_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1240
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_EMB_CACHE_OUTPUT_TYPES

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define DISPATCH_EMB_CACHE_OUTPUT_TYPES( EMB_TYPE,
                    CACHE_TYPE,
                    OUTPUT_TYPE,
                    NAME,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_EMB_CACHE_TYPES

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    #define DISPATCH_EMB_CACHE_TYPES( EMB_TYPE,
                    CACHE_TYPE,
                    NAME,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    const auto& emb_type = EMB_TYPE; \
                    +
                    const auto& cache_type = CACHE_TYPE; \
                    +
                    _DISPATCH_EMB_CACHE_TYPES(emb_type, cache_type, NAME, __VA_ARGS__) \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_EMB_GRAD_CACHE_TYPES

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define DISPATCH_EMB_GRAD_CACHE_TYPES( EMB_TYPE,
                    GRAD_TYPE,
                    CACHE_TYPE,
                    NAME,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    const auto& emb_type = EMB_TYPE; \
                    +
                    const auto& grad_type = GRAD_TYPE; \
                    +
                    const auto& cache_type = CACHE_TYPE; \
                    +
                    at::ScalarType _emb_t = ::detail::scalar_type(emb_type); \
                    +
                    at::ScalarType _grad_t = ::detail::scalar_type(grad_type); \
                    +
                    at::ScalarType _cache_t = ::detail::scalar_type(cache_type); \
                    +
                    switch (_grad_t) { \
                    +
                    PRIVATE_CASE_TYPE_CACHE_EMB( \
                    +
                    at::ScalarType::Float, _cache_t, _emb_t, float, NAME, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_CACHE_EMB( \
                    +
                    at::ScalarType::Half, _cache_t, _emb_t, at::Half, NAME, __VA_ARGS__) \
                    +
                    default: \
                    +
                    AT_ERROR( \
                    +
                    #NAME, " not implemented for grad_t '", toString(_grad_t), "'"); \
                    +
                    } \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OUTPUT_TYPES

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define DISPATCH_OUTPUT_TYPES( OUTPUT_TYPE,
                    NAME,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    const auto& output_type = OUTPUT_TYPE; \
                    +
                    at::ScalarType _output_t = ::detail::scalar_type(output_type); \
                    +
                    switch (_output_t) { \
                    +
                    PRIVATE_CASE_TYPE_OUTPUT2(at::ScalarType::Half, at::Half, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_OUTPUT2( \
                    +
                    at::ScalarType::BFloat16, at::BFloat16, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_OUTPUT2(at::ScalarType::Float, float, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_OUTPUT2(at::ScalarType::Byte, uint8_t, __VA_ARGS__) \
                    +
                    default: \
                    +
                    AT_ERROR( \
                    +
                    #NAME, \
                    +
                    " not implemented for output_t '", \
                    +
                    toString(_output_t), \
                    +
                    "'"); \
                    +
                    } \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_DISPATCH_FLOAT_AND_HALF

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define FBGEMM_DISPATCH_FLOAT_AND_HALF( TYPE,
                    NAME,
                    ... )
                    +
                    +Value:
                    AT_DISPATCH_SWITCH( \
                    +
                    TYPE, NAME, FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE(__VA_ARGS__))
                    +
                    #define FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE(...)
                    Definition dispatch_macros.h:192
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE

                    + +
                    +
                    + + + + + + + +
                    #define FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE( ...)
                    +
                    +Value:
                    AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
                    +
                    AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16( TYPE,
                    NAME,
                    ... )
                    +
                    +Value:
                    AT_DISPATCH_SWITCH( \
                    + +
                    #define FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE(...)
                    Definition dispatch_macros.h:196
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE

                    + +
                    +
                    + + + + + + + +
                    #define FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE( ...)
                    +
                    +Value:
                    +
                    AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
                    +
                    +
                    +
                    + +

                    ◆ PRIVATE_CASE_TYPE_CACHE

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define PRIVATE_CASE_TYPE_CACHE( enum_type,
                    type,
                    ... )
                    +
                    +Value:
                    case enum_type: { \
                    +
                    using cache_t = type; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }
                    +
                    +
                    +
                    + +

                    ◆ PRIVATE_CASE_TYPE_CACHE_EMB

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define PRIVATE_CASE_TYPE_CACHE_EMB( grad_enum_type,
                    _cache_t,
                    _emb_t,
                    grad_cxx_type,
                    NAME,
                    ... )
                    +
                    +Value:
                    case grad_enum_type: { \
                    +
                    using grad_t = grad_cxx_type; \
                    +
                    switch (_emb_t) { \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Byte, _cache_t, uint8_t, NAME, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Float, _cache_t, float, NAME, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_EMB( \
                    +
                    at::ScalarType::Half, _cache_t, at::Half, NAME, __VA_ARGS__) \
                    +
                    default: \
                    +
                    AT_ERROR( \
                    +
                    #NAME, " not implemented for emb_t '", toString(_emb_t), "'"); \
                    +
                    } \
                    +
                    }
                    +
                    +
                    +
                    + +

                    ◆ PRIVATE_CASE_TYPE_EMB

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define PRIVATE_CASE_TYPE_EMB( enum_type1,
                    enum_type2,
                    type1,
                    NAME,
                    ... )
                    +
                    +Value:
                    case enum_type1: { \
                    +
                    using emb_t = type1; \
                    +
                    switch (enum_type2) { \
                    +
                    PRIVATE_CASE_TYPE_CACHE(at::ScalarType::Float, float, __VA_ARGS__) \
                    +
                    PRIVATE_CASE_TYPE_CACHE(at::ScalarType::Half, at::Half, __VA_ARGS__) \
                    +
                    default: \
                    +
                    AT_ERROR( \
                    +
                    #NAME, \
                    +
                    " not implemented for cache_t '", \
                    +
                    toString(enum_type2), \
                    +
                    "'"); \
                    +
                    } \
                    +
                    }
                    +
                    +
                    +
                    + +

                    ◆ PRIVATE_CASE_TYPE_OUTPUT

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define PRIVATE_CASE_TYPE_OUTPUT( output_enum_type1,
                    emb_enum_type1,
                    cache_enum_type1,
                    output_type1,
                    NAME,
                    ... )
                    +
                    +Value:
                    case output_enum_type1: { \
                    +
                    using output_t = output_type1; \
                    +
                    _DISPATCH_EMB_CACHE_TYPES( \
                    +
                    emb_enum_type1, cache_enum_type1, NAME, __VA_ARGS__) \
                    +
                    }
                    +
                    +
                    +
                    + +

                    ◆ PRIVATE_CASE_TYPE_OUTPUT2

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define PRIVATE_CASE_TYPE_OUTPUT2( enum_type,
                    type,
                    ... )
                    +
                    +Value:
                    case enum_type: { \
                    +
                    using output_t = type; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }
                    +
                    +
                    +
                    + +

                    ◆ PT2_COMPLIANT_TAG

                    + +
                    +
                    + + + + +
                    #define PT2_COMPLIANT_TAG
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/doxygen.css b/doxygen.css index 009a9b554..7b7d851b8 100644 --- a/doxygen.css +++ b/doxygen.css @@ -1,4 +1,4 @@ -/* The standard CSS for doxygen 1.9.8*/ +/* The standard CSS for doxygen 1.10.0*/ html { /* page base colors */ @@ -145,6 +145,7 @@ html { --fragment-lineno-link-bg-color: #D8D8D8; --fragment-lineno-link-hover-fg-color: #4665A2; --fragment-lineno-link-hover-bg-color: #C8C8C8; +--fragment-copy-ok-color: #2EC82E; --tooltip-foreground-color: black; --tooltip-background-color: white; --tooltip-border-color: gray; @@ -168,6 +169,28 @@ html { --font-family-icon: Arial,Helvetica; --font-family-tooltip: Roboto,sans-serif; +/** special sections */ +--warning-color-bg: #f8d1cc; +--warning-color-hl: #b61825; +--warning-color-text: #75070f; +--note-color-bg: #faf3d8; +--note-color-hl: #f3a600; +--note-color-text: #5f4204; +--todo-color-bg: #e4f3ff; +--todo-color-hl: #1879C4; +--todo-color-text: #274a5c; +--test-color-bg: #e8e8ff; +--test-color-hl: #3939C4; +--test-color-text: #1a1a5c; +--deprecated-color-bg: #ecf0f3; +--deprecated-color-hl: #5b6269; +--deprecated-color-text: #43454a; +--bug-color-bg: #e4dafd; +--bug-color-hl: #5b2bdd; +--bug-color-text: #2a0d72; +--invariant-color-bg: #d8f1e3; +--invariant-color-hl: #44b86f; +--invariant-color-text: #265532; } @media (prefers-color-scheme: dark) { @@ -309,7 +332,7 @@ html { --code-link-color: #79C0FF; --code-external-link-color: #79C0FF; --fragment-foreground-color: #C9D1D9; ---fragment-background-color: black; +--fragment-background-color: #090D16; --fragment-border-color: #30363D; --fragment-lineno-border-color: #30363D; --fragment-lineno-background-color: black; @@ -318,6 +341,7 @@ html { --fragment-lineno-link-bg-color: #303030; --fragment-lineno-link-hover-fg-color: #8E96A1; --fragment-lineno-link-hover-bg-color: #505050; +--fragment-copy-ok-color: #0EA80E; --tooltip-foreground-color: #C9D1D9; --tooltip-background-color: #202020; --tooltip-border-color: #C9D1D9; @@ 
-341,6 +365,28 @@ html { --font-family-icon: Arial,Helvetica; --font-family-tooltip: Roboto,sans-serif; +/** special sections */ +--warning-color-bg: #2e1917; +--warning-color-hl: #ad2617; +--warning-color-text: #f5b1aa; +--note-color-bg: #3b2e04; +--note-color-hl: #f1b602; +--note-color-text: #ceb670; +--todo-color-bg: #163750; +--todo-color-hl: #1982D2; +--todo-color-text: #dcf0fa; +--test-color-bg: #121258; +--test-color-hl: #4242cf; +--test-color-text: #c0c0da; +--deprecated-color-bg: #2e323b; +--deprecated-color-hl: #738396; +--deprecated-color-text: #abb0bd; +--bug-color-bg: #2a2536; +--bug-color-hl: #7661b3; +--bug-color-text: #ae9ed6; +--invariant-color-bg: #303a35; +--invariant-color-hl: #76ce96; +--invariant-color-text: #cceed5; }} body { background-color: var(--page-background-color); @@ -357,8 +403,6 @@ body, table, div, p, dl { /* @group Heading Levels */ .title { - font-weight: 400; - font-size: 14px; font-family: var(--font-family-normal); line-height: 28px; font-size: 150%; @@ -556,7 +600,13 @@ a { } a:hover { - text-decoration: underline; + text-decoration: none; + background: linear-gradient(to bottom, transparent 0,transparent calc(100% - 1px), currentColor 100%); +} + +a:hover > span.arrow { + text-decoration: none; + background : var(--nav-background-color); } a.el { @@ -632,30 +682,63 @@ ul.multicol { .fragment { text-align: left; direction: ltr; - overflow-x: auto; /*Fixed: fragment lines overlap floating elements*/ + overflow-x: auto; overflow-y: hidden; + position: relative; + min-height: 12px; + margin: 10px 0px; + padding: 10px 10px; + border: 1px solid var(--fragment-border-color); + border-radius: 4px; + background-color: var(--fragment-background-color); + color: var(--fragment-foreground-color); } pre.fragment { - border: 1px solid var(--fragment-border-color); - background-color: var(--fragment-background-color); - color: var(--fragment-foreground-color); - padding: 4px 6px; - margin: 4px 8px 4px 2px; + word-wrap: break-word; + 
font-size: 10pt; + line-height: 125%; + font-family: var(--font-family-monospace); +} + +.clipboard { + width: 24px; + height: 24px; + right: 5px; + top: 5px; + opacity: 0; + position: absolute; + display: inline; overflow: auto; - word-wrap: break-word; - font-size: 9pt; - line-height: 125%; - font-family: var(--font-family-monospace); - font-size: 105%; + fill: var(--fragment-foreground-color); + justify-content: center; + align-items: center; + cursor: pointer; +} + +.clipboard.success { + border: 1px solid var(--fragment-foreground-color); + border-radius: 4px; +} + +.fragment:hover .clipboard, .clipboard.success { + opacity: .28; +} + +.clipboard:hover, .clipboard.success { + opacity: 1 !important; +} + +.clipboard:active:not([class~=success]) svg { + transform: scale(.91); +} + +.clipboard.success svg { + fill: var(--fragment-copy-ok-color); } -div.fragment { - padding: 0 0 1px 0; /*Fixed: last line underline overlap border*/ - margin: 4px 8px 4px 2px; - color: var(--fragment-foreground-color); - background-color: var(--fragment-background-color); - border: 1px solid var(--fragment-border-color); +.clipboard.success { + border-color: var(--fragment-copy-ok-color); } div.line { @@ -778,10 +861,6 @@ img.light-mode-visible { display: none; } -img.formulaDsp { - -} - img.formulaInl, img.inline { vertical-align: middle; } @@ -1081,17 +1160,25 @@ dl.reflist dd { .paramtype { white-space: nowrap; + padding: 0px; + padding-bottom: 1px; } .paramname { - color: var(--memdef-param-name-color); white-space: nowrap; + padding: 0px; + padding-bottom: 1px; + margin-left: 2px; } + .paramname em { + color: var(--memdef-param-name-color); font-style: normal; + margin-right: 1px; } -.paramname code { - line-height: 14px; + +.paramname .paramdefval { + font-family: var(--font-family-monospace); } .params, .retval, .exception, .tparams { @@ -1425,7 +1512,6 @@ table.fieldtable { { height:32px; display:block; - text-decoration: none; outline: none; color: 
var(--nav-text-normal-color); font-family: var(--font-family-nav); @@ -1514,7 +1600,8 @@ dl { padding: 0 0 0 0; } -/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug, dl.examples */ +/* + dl.section { margin-left: 0px; padding-left: 0px; @@ -1569,8 +1656,101 @@ dl.bug { border-color: #C08050; } +*/ + +dl.bug dt a, dl.deprecated dt a, dl.todo dt a, dl.test a { + font-weight: bold !important; +} + +dl.warning, dl.attention, dl.note, dl.deprecated, dl.bug, +dl.invariant, dl.pre, dl.post, dl.todo, dl.test, dl.remark { + padding: 10px; + margin: 10px 0px; + overflow: hidden; + margin-left: 0; + border-radius: 4px; +} + dl.section dd { - margin-bottom: 6px; + margin-bottom: 2px; +} + +dl.warning, dl.attention { + background: var(--warning-color-bg); + border-left: 8px solid var(--warning-color-hl); + color: var(--warning-color-text); +} + +dl.warning dt, dl.attention dt { + color: var(--warning-color-hl); +} + +dl.note, dl.remark { + background: var(--note-color-bg); + border-left: 8px solid var(--note-color-hl); + color: var(--note-color-text); +} + +dl.note dt, dl.remark dt { + color: var(--note-color-hl); +} + +dl.todo { + background: var(--todo-color-bg); + border-left: 8px solid var(--todo-color-hl); + color: var(--todo-color-text); +} + +dl.todo dt { + color: var(--todo-color-hl); +} + +dl.test { + background: var(--test-color-bg); + border-left: 8px solid var(--test-color-hl); + color: var(--test-color-text); +} + +dl.test dt { + color: var(--test-color-hl); +} + +dl.bug dt a { + color: var(--bug-color-hl) !important; +} + +dl.bug { + background: var(--bug-color-bg); + border-left: 8px solid var(--bug-color-hl); + color: var(--bug-color-text); +} + +dl.bug dt a { + color: var(--bug-color-hl) !important; +} + +dl.deprecated { + background: var(--deprecated-color-bg); + border-left: 8px solid var(--deprecated-color-hl); + color: var(--deprecated-color-text); +} + +dl.deprecated dt a { + color: 
var(--deprecated-color-hl) !important; +} + +dl.section dd, dl.bug dd, dl.deprecated dd, dl.todo dd, dl.test dd { + margin-inline-start: 0px; +} + +dl.invariant, dl.pre, dl.post { + background: var(--invariant-color-bg); + border-left: 8px solid var(--invariant-color-hl); + color: var(--invariant-color-text); +} + +dl.invariant dt, dl.pre dt, dl.post dt { + color: var(--invariant-color-hl); } @@ -1585,12 +1765,12 @@ dl.section dd { vertical-align: bottom; border-collapse: separate; } - + #projectlogo img -{ +{ border: 0px none; } - + #projectalign { vertical-align: middle; diff --git a/doxygen_crawl.html b/doxygen_crawl.html new file mode 100644 index 000000000..79709b9a4 --- /dev/null +++ b/doxygen_crawl.html @@ -0,0 +1,833 @@ + + + +Validator / crawler helper + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dynsections.js b/dynsections.js index b73c82889..8f493264f 100644 --- a/dynsections.js +++ b/dynsections.js @@ -22,171 +22,173 @@ @licend The above is the entire license notice for the JavaScript code in this file */ -function toggleVisibility(linkObj) -{ - var base = $(linkObj).attr('id'); - var summary = $('#'+base+'-summary'); - var content = $('#'+base+'-content'); - var trigger = $('#'+base+'-trigger'); - var src=$(trigger).attr('src'); - if (content.is(':visible')===true) { - content.hide(); - summary.show(); - $(linkObj).addClass('closed').removeClass('opened'); - $(trigger).attr('src',src.substring(0,src.length-8)+'closed.png'); - } else { - content.show(); - summary.hide(); - $(linkObj).removeClass('closed').addClass('opened'); - $(trigger).attr('src',src.substring(0,src.length-10)+'open.png'); - } - return false; -} - -function updateStripes() -{ - $('table.directory tr'). - removeClass('even').filter(':visible:even').addClass('even'); - $('table.directory tr'). 
- removeClass('odd').filter(':visible:odd').addClass('odd'); -} - -function toggleLevel(level) -{ - $('table.directory tr').each(function() { - var l = this.id.split('_').length-1; - var i = $('#img'+this.id.substring(3)); - var a = $('#arr'+this.id.substring(3)); - if (l'); - // add vertical lines to other rows - $('span[class=lineno]').not(':eq(0)').append(''); - // add toggle controls to lines with fold divs - $('div[class=foldopen]').each(function() { - // extract specific id to use - var id = $(this).attr('id').replace('foldopen',''); - // extract start and end foldable fragment attributes - var start = $(this).attr('data-start'); - var end = $(this).attr('data-end'); - // replace normal fold span with controls for the first line of a foldable fragment - $(this).find('span[class=fold]:first').replaceWith(''); - // append div for folded (closed) representation - $(this).after(''); - // extract the first line from the "open" section to represent closed content - var line = $(this).children().first().clone(); - // remove any glow that might still be active on the original line - $(line).removeClass('glow'); - if (start) { - // if line already ends with a start marker (e.g. 
trailing {), remove it - $(line).html($(line).html().replace(new RegExp('\\s*'+start+'\\s*$','g'),'')); + this.updateStripes(); + }, + + toggleFolder : function(id) { + // the clicked row + const currentRow = $('#row_'+id); + + // all rows after the clicked row + const rows = currentRow.nextAll("tr"); + + const re = new RegExp('^row_'+id+'\\d+_$', "i"); //only one sub + + // only match elements AFTER this one (can't hide elements before) + const childRows = rows.filter(function() { return this.id.match(re); }); + + // first row is visible we are HIDING + if (childRows.filter(':first').is(':visible')===true) { + // replace down arrow by right arrow for current row + const currentRowSpans = currentRow.find("span"); + currentRowSpans.filter(".iconfopen").removeClass("iconfopen").addClass("iconfclosed"); + currentRowSpans.filter(".arrow").html('►'); + rows.filter("[id^=row_"+id+"]").hide(); // hide all children + } else { // we are SHOWING + // replace right arrow by down arrow for current row + const currentRowSpans = currentRow.find("span"); + currentRowSpans.filter(".iconfclosed").removeClass("iconfclosed").addClass("iconfopen"); + currentRowSpans.filter(".arrow").html('▼'); + // replace down arrows by right arrows for child rows + const childRowsSpans = childRows.find("span"); + childRowsSpans.filter(".iconfopen").removeClass("iconfopen").addClass("iconfclosed"); + childRowsSpans.filter(".arrow").html('►'); + childRows.show(); //show all children } - // replace minus with plus symbol - $(line).find('span[class=fold]').css('background-image',plusImg[relPath]); - // append ellipsis - $(line).append(' '+start+''+end); - // insert constructed line into closed div - $('#foldclosed'+id).html(line); - }); -} - + this.updateStripes(); + }, + + toggleInherit : function(id) { + const rows = $('tr.inherit.'+id); + const img = $('tr.inherit_header.'+id+' img'); + const src = $(img).attr('src'); + if (rows.filter(':first').is(':visible')===true) { + rows.css('display','none'); 
+ $(img).attr('src',src.substring(0,src.length-8)+'closed.png'); + } else { + rows.css('display','table-row'); // using show() causes jump in firefox + $(img).attr('src',src.substring(0,src.length-10)+'open.png'); + } + }, +}; + +let codefold = { + opened : true, + + // in case HTML_COLORSTYLE is LIGHT or DARK the vars will be replaced, so we write them out explicitly and use double quotes + plusImg: [ "var(--fold-plus-image)", "var(--fold-plus-image-relpath)" ], + minusImg: [ "var(--fold-minus-image)", "var(--fold-minus-image-relpath)" ], + + // toggle all folding blocks + toggle_all : function(relPath) { + if (this.opened) { + $('#fold_all').css('background-image',this.plusImg[relPath]); + $('div[id^=foldopen]').hide(); + $('div[id^=foldclosed]').show(); + } else { + $('#fold_all').css('background-image',this.minusImg[relPath]); + $('div[id^=foldopen]').show(); + $('div[id^=foldclosed]').hide(); + } + this.opened=!this.opened; + }, + + // toggle single folding block + toggle : function(id) { + $('#foldopen'+id).toggle(); + $('#foldclosed'+id).toggle(); + }, + + init : function(relPath) { + $('span[class=lineno]').css({ + 'padding-right':'4px', + 'margin-right':'2px', + 'display':'inline-block', + 'width':'54px', + 'background':'linear-gradient(var(--fold-line-color),var(--fold-line-color)) no-repeat 46px/2px 100%' + }); + // add global toggle to first line + $('span[class=lineno]:first').append(''); + // add vertical lines to other rows + $('span[class=lineno]').not(':eq(0)').append(''); + // add toggle controls to lines with fold divs + $('div[class=foldopen]').each(function() { + // extract specific id to use + const id = $(this).attr('id').replace('foldopen',''); + // extract start and end foldable fragment attributes + const start = $(this).attr('data-start'); + const end = $(this).attr('data-end'); + // replace normal fold span with controls for the first line of a foldable fragment + $(this).find('span[class=fold]:first').replaceWith(''); + // append div 
for folded (closed) representation + $(this).after(''); + // extract the first line from the "open" section to represent closed content + const line = $(this).children().first().clone(); + // remove any glow that might still be active on the original line + $(line).removeClass('glow'); + if (start) { + // if line already ends with a start marker (e.g. trailing {), remove it + $(line).html($(line).html().replace(new RegExp('\\s*'+start+'\\s*$','g'),'')); + } + // replace minus with plus symbol + $(line).find('span[class=fold]').css('background-image',codefold.plusImg[relPath]); + // append ellipsis + $(line).append(' '+start+''+end); + // insert constructed line into closed div + $('#foldclosed'+id).html(line); + }); + }, +}; /* @license-end */ diff --git a/embedding__backward__dense__host_8cpp.html b/embedding__backward__dense__host_8cpp.html new file mode 100644 index 000000000..fe5c96e7c --- /dev/null +++ b/embedding__backward__dense__host_8cpp.html @@ -0,0 +1,674 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_dense_host.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_dense_host.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ dense_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_dense_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const double unused )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_dense_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const double unused )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_dense_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_dense_function (Tensor dev_weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    c10::optional< Tensor > indice_weights,
                    c10::optional< Tensor > feature_requires_grad,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const double unused )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__dense__host__cpu_8cpp.html b/embedding__backward__dense__host__cpu_8cpp.html new file mode 100644 index 000000000..7c6724d7e --- /dev/null +++ b/embedding__backward__dense__host__cpu_8cpp.html @@ -0,0 +1,180 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_dense_host_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_dense_host_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_dense_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    double unused )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__cpu__approx__template_8cpp.html b/embedding__backward__split__cpu__approx__template_8cpp.html new file mode 100644 index 000000000..20bee62f2 --- /dev/null +++ b/embedding__backward__split__cpu__approx__template_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_cpu_approx_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_cpu_approx_template.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__cpu__template_8cpp.html b/embedding__backward__split__cpu__template_8cpp.html new file mode 100644 index 000000000..fd1b88bba --- /dev/null +++ b/embedding__backward__split__cpu__template_8cpp.html @@ -0,0 +1,124 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_cpu_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_backward_split_cpu_template.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__grad__template_8cu.html b/embedding__backward__split__grad__template_8cu.html new file mode 100644 index 000000000..9dbbff37b --- /dev/null +++ b/embedding__backward__split__grad__template_8cu.html @@ -0,0 +1,142 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_grad_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_grad_template.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    +
                    +template<typename info_pta_t , typename info_t , bool nobag>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__host__cpu__template_8cpp.html b/embedding__backward__split__host__cpu__template_8cpp.html new file mode 100644 index 000000000..59d277d7b --- /dev/null +++ b/embedding__backward__split__host__cpu__template_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_host_cpu_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_host_cpu_template.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__host__template_8cpp.html b/embedding__backward__split__host__template_8cpp.html new file mode 100644 index 000000000..360a53d99 --- /dev/null +++ b/embedding__backward__split__host__template_8cpp.html @@ -0,0 +1,124 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_host_template.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ split_embedding

                    + +
                    +
                    + + + + +
                    Tensor split_embedding
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__indice__weights__template_8cu.html b/embedding__backward__split__indice__weights__template_8cu.html new file mode 100644 index 000000000..2017e618e --- /dev/null +++ b/embedding__backward__split__indice__weights__template_8cu.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_indice_weights_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_indice_weights_template.cu File Reference
                    +
                    + + + + + diff --git a/embedding__backward__split__kernel__cta__template_8cu.html b/embedding__backward__split__kernel__cta__template_8cu.html new file mode 100644 index 000000000..aea6e0108 --- /dev/null +++ b/embedding__backward__split__kernel__cta__template_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_kernel_cta_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_kernel_cta_template.cu File Reference
                    +
                    +
                    +
                    #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                    +#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +#include "gen_embedding_optimizer_{{ optimizer }}_split_device_kernel.cuh"
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__kernel__warp__template_8cu.html b/embedding__backward__split__kernel__warp__template_8cu.html new file mode 100644 index 000000000..e911704ae --- /dev/null +++ b/embedding__backward__split__kernel__warp__template_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_kernel_warp_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_kernel_warp_template.cu File Reference
                    +
                    +
                    +
                    #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                    +#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +#include "gen_embedding_optimizer_{{ optimizer }}_split_device_kernel.cuh"
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__split__template_8cu.html b/embedding__backward__split__template_8cu.html new file mode 100644 index 000000000..cb30333bb --- /dev/null +++ b/embedding__backward__split__template_8cu.html @@ -0,0 +1,161 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_backward_split_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_backward_split_template.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = {{ max_embedding_dim
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr int kThreadGroupSize = kWarpSize; \
                    +
                    {%- for kMaxElemPerThread in range(1, max_embedding_dim
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__backward__template__helpers_8cuh.html b/embedding__backward__template__helpers_8cuh.html new file mode 100644 index 000000000..a906c7702 --- /dev/null +++ b/embedding__backward__template__helpers_8cuh.html @@ -0,0 +1,197 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/embedding_backward_template_helpers.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_backward_template_helpers.cuh File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/TensorUtils.h>
                    +#include <ATen/core/TensorAccessor.h>
                    +#include <ATen/cuda/CUDAContext.h>
                    +#include <ATen/cuda/CUDAGeneratorImpl.h>
                    +#include <c10/cuda/CUDAGuard.h>
                    +#include <ATen/cuda/Atomic.cuh>
                    +#include <ATen/cuda/CUDAGraphsUtils.cuh>
                    +#include <cuda.h>
                    +#include <cuda_runtime.h>
                    +#include <curand_kernel.h>
                    +#include <mutex>
                    +#include "dispatch_macros.h"
                    +#include "embedding_common.h"
                    +#include "fbgemm_cuda_utils.cuh"
                    +#include "sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ SHFL_SYNC

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SHFL_SYNC( val,
                    srcLane )    shfl_sync(val, srcLane, kThreadGroupSize, shfl_sync_mask)
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ gpuAtomicIncrement()

                    + +
                    +
                    + + + + + + + +
                    DEVICE_INLINE int64_t gpuAtomicIncrement (int64_t * p)
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ kBackwardMaxThreads

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr size_t kBackwardMaxThreads = 512
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ kCacheLocationMissing

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr int32_t kCacheLocationMissing = -1
                    +
                    +constexpr
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__bounds__check_8cu.html b/embedding__bounds__check_8cu.html new file mode 100644 index 000000000..924f18405 --- /dev/null +++ b/embedding__bounds__check_8cu.html @@ -0,0 +1,163 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_bounds_check.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_bounds_check.cu File Reference
                    +
                    +
                    +
                    #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                    +#include <c10/cuda/CUDADeviceAssertion.h>
                    +#include <c10/cuda/CUDAException.h>
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename index_t , bool vbe>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ adjust_offset_kernel()

                    + +
                    +
                    +
                    +template<typename index_t >
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __device__ void adjust_offset_kernel (index_t & indices_start,
                    index_t & indices_end,
                    const index_t num_indices,
                    index_t *const offset_acc_start,
                    index_t *const offset_acc_end )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__bounds__check__host_8cpp.html b/embedding__bounds__check__host_8cpp.html new file mode 100644 index 000000000..ce48eb209 --- /dev/null +++ b/embedding__bounds__check__host_8cpp.html @@ -0,0 +1,151 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_bounds_check_host.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_bounds_check_host.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <ATen/cuda/CUDAContext.h>
                    +#include <torch/library.h>
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__bounds__check__host__cpu_8cpp.html b/embedding__bounds__check__host__cpu_8cpp.html new file mode 100644 index 000000000..a058f8b95 --- /dev/null +++ b/embedding__bounds__check__host__cpu_8cpp.html @@ -0,0 +1,152 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_bounds_check_host_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_bounds_check_host_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__common_8h.html b/embedding__common_8h.html new file mode 100644 index 000000000..52f451ff6 --- /dev/null +++ b/embedding__common_8h.html @@ -0,0 +1,100 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/embedding_common.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_common.h File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <c10/macros/Macros.h>
                    +#include <cstdint>
                    +
                    + + + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    namespace  nbit
                     
                    +
                    + + + + diff --git a/embedding__forward__quantized__cpu__template_8cpp.html b/embedding__forward__quantized__cpu__template_8cpp.html new file mode 100644 index 000000000..99ed3b18c --- /dev/null +++ b/embedding__forward__quantized__cpu__template_8cpp.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_cpu_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_quantized_cpu_template.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/Context.h>
                    +#include <ATen/Parallel.h>
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include <cstring>
                    +
                    + + + + diff --git a/embedding__forward__quantized__host_8cpp.html b/embedding__forward__quantized__host_8cpp.html new file mode 100644 index 000000000..eacca4977 --- /dev/null +++ b/embedding__forward__quantized__host_8cpp.html @@ -0,0 +1,500 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_host.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_quantized_host.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <ATen/cuda/CUDAContext.h>
                    +#include <torch/library.h>
                    +#include "c10/core/ScalarType.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
                    +#include <algorithm>
                    +
                    + + + +

                    +Functions

                    Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional< Tensor > indice_weights, int64_t output_dtype, c10::optional< Tensor > lxu_cache_weights, c10::optional< Tensor > lxu_cache_locations, c10::optional< int64_t > row_alignment, c10::optional< int64_t > max_float8_D, c10::optional< int64_t > fp8_exponent_bits, c10::optional< int64_t > fp8_exponent_bias, c10::optional< Tensor > cache_hash_size_cumsum, c10::optional< int64_t > total_cache_hash_size, c10::optional< Tensor > cache_index_table_map, c10::optional< Tensor > lxu_cache_state, c10::optional< Tensor > lxu_state)
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_unweighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    int64_t max_float8_D,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_weighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    Tensor indice_weights,
                    int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    int64_t max_float8_D,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    int64_t D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    int64_t max_float8_D,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__quantized__host__cpu_8cpp.html b/embedding__forward__quantized__host__cpu_8cpp.html new file mode 100644 index 000000000..d6e3433b6 --- /dev/null +++ b/embedding__forward__quantized__host__cpu_8cpp.html @@ -0,0 +1,378 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_host_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_quantized_host_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/custom_class.h>
                    +#include <torch/script.h>
                    +#include <ostream>
                    +#include <torch/serialize/input-archive.h>
                    +#include <torch/serialize/output-archive.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_unweighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_weighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    Tensor indice_weights,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    int64_t D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__quantized__split__lookup_8cu.html b/embedding__forward__quantized__split__lookup_8cu.html new file mode 100644 index 000000000..6b35298a6 --- /dev/null +++ b/embedding__forward__quantized__split__lookup_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_split_lookup.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_quantized_split_lookup.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__quantized__split__nbit__host__template_8cu.html b/embedding__forward__quantized__split__nbit__host__template_8cu.html new file mode 100644 index 000000000..59945f3e4 --- /dev/null +++ b/embedding__forward__quantized__split__nbit__host__template_8cu.html @@ -0,0 +1,545 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_host_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_quantized_split_nbit_host_template.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ X [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT2_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name1, indice_weights, float, 1, 32), {% endif %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    #define MAKE_PTA_WITH_NAME(FUNC_NAME, TENSOR, T, N, INDEX_NBITS)
                    Definition fbgemm_tensor_accessor.h:577
                    +
                    template uint8_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1240
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:119
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:120
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:121
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:123
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t D
                    Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:101
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets
                    Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:104
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const lxu_cache_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:58
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const uvm_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:57
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t B
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:60
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t T
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:61
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t *__restrict__ const const index_t *__restrict__ const const uint32_t *__restrict__ const const int64_t *__restrict__ const const int32_t *__restrict__ const lxu_cache_locations
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:69
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const weights_placements
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:59
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t div_round_up(uint32_t a, uint32_t b)
                    Definition embedding_common.h:94
                    +
                    +
                    +
                    + +

                    ◆ X [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT4_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name2, indice_weights, float, 1, 32), {% endif %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name3, indice_weights, float, 1, 32), {% endif %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP8_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name4, indice_weights, float, 1, 32), {% endif %} \
                    +
                    fp8_exponent_bits, \
                    +
                    fp8_exponent_bias, \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP16_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name5, indice_weights, float, 1, 32), {% endif %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP32_split_embedding{{ "_nobag" if nobag else "" }}_codegen_forward_{{ wdesc }}_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_tys, uint8_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, D_offsets, int32_t, 1, 32), \
                    +
                    {% else %} \
                    +
                    D, \
                    +
                    {% endif %} \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, offsets, index_t, 1, 32), \
                    +
                    {% if not nobag %} \
                    +
                    pooling_mode, \
                    +
                    {% endif %} \
                    +
                    row_alignment, \
                    +
                    {% if weighted %} MAKE_PTA_WITH_NAME(func_name6, indice_weights, float, 1, 32), {% endif %} \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ Y

                    + +
                    +
                    + + + + + + + +
                    #define Y( ...)
                    +
                    +Value:
                    if (device_only) { \
                    +
                    X(true, __VA_ARGS__) \
                    +
                    } else { \
                    +
                    X(false, __VA_ARGS__) \
                    +
                    };
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor
                    +
                    +Initial value:
                    {% set wdesc = "weighted" if weighted else "unweighted" %}
                    +
                    +
                    +
                    +
                    using namespace fbgemm_gpu at::Tensor
                    +
                    Definition embedding_ops_placeholder.cpp:15
                    +
                    +
                    +
                    +
                    + + + + diff --git a/embedding__forward__quantized__split__nbit__kernel__template_8cu.html b/embedding__forward__quantized__split__nbit__kernel__template_8cu.html new file mode 100644 index 000000000..b64d32f71 --- /dev/null +++ b/embedding__forward__quantized__split__nbit__kernel__template_8cu.html @@ -0,0 +1,110 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_quantized_split_nbit_kernel_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_quantized_split_nbit_kernel_template.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor
                    +
                    +Initial value:
                    {% set wdesc = "weighted" if weighted else "unweighted" %}
                    +
                    +
                    +
                    +
                    using namespace fbgemm_gpu at::Tensor
                    +
                    Definition embedding_ops_placeholder.cpp:15
                    +
                    +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__cpu_8cpp.html b/embedding__forward__split__cpu_8cpp.html new file mode 100644 index 000000000..cbaffecf9 --- /dev/null +++ b/embedding__forward__split__cpu_8cpp.html @@ -0,0 +1,448 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm/Utils.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include <omp.h>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ INSTANTIATE_BATCHED_CSR2CSC

                    + +
                    +
                    + + + + + + + +
                    #define INSTANTIATE_BATCHED_CSR2CSC( SCALAR_T)
                    +
                    +Value:
                    template void csr2csc_template_<SCALAR_T, true>( \
                    +
                    HyperCompressedSparseColumn & csc, \
                    +
                    int B, \
                    +
                    const at::TensorAccessor<int64_t, 1>& csr_offsets, \
                    +
                    const at::TensorAccessor<int64_t, 1>& csr_indices, \
                    +
                    const at::TensorAccessor<SCALAR_T, 1>& csr_weights, \
                    + + + +
                    \
                    + +
                    HyperCompressedSparseColumn & csc, \
                    +
                    int B, \
                    +
                    const at::TensorAccessor<int64_t, 1>& csr_offsets, \
                    +
                    const at::TensorAccessor<int64_t, 1>& csr_indices, \
                    +
                    const at::TensorAccessor<SCALAR_T, 1>& csr_weights, \
                    + + + +
                    Definition fbgemm_tensor_accessor.h:128
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    __global__ const int32_t B
                    Definition sparse_batched_unary_embeddings.cu:20
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_cpu (Tensor weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor hash_size_cumsum,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_cpu_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_cpu_meta (Tensor weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor hash_size_cumsum,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cpu (Tensor grad_output,
                    Tensor weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    Tensor indices,
                    Tensor offsets,
                    Tensor feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_forward_cpu_kernel()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_forward_cpu_kernel (Tensor weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor hash_size_cumsum,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    Tensor output )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_grad_indice_weights_cpu_kernel()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_grad_indice_weights_cpu_kernel (Tensor grad_output,
                    Tensor weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    Tensor indices,
                    Tensor offsets,
                    Tensor feature_requires_grad,
                    Tensor grad_indice_weights )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__cpu_8h.html b/embedding__forward__split__cpu_8h.html new file mode 100644 index 000000000..c95eb1dc3 --- /dev/null +++ b/embedding__forward__split__cpu_8h.html @@ -0,0 +1,212 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_cpu.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_split_cpu.h File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/Parallel.h>
                    +#include "fbgemm/Utils.h"
                    +
                    + + + +

                    +Classes

                    struct  HyperCompressedSparseColumn
                     
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    at::Tensor split_embedding_codegen_forward_cpu (at::Tensor weights,
                    at::Tensor weights_offsets,
                    at::Tensor D_offsets,
                    int64_t total_D,
                    at::Tensor hash_size_cumsum,
                    at::Tensor indices,
                    at::Tensor offsets,
                    int64_t pooling_mode,
                    at::Tensor indice_weights,
                    int64_t output_dtype = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    at::Tensor split_embedding_codegen_grad_indice_weights_cpu (at::Tensor grad_output,
                    at::Tensor weights,
                    at::Tensor weights_offsets,
                    at::Tensor D_offsets,
                    at::Tensor indices,
                    at::Tensor offsets,
                    at::Tensor feature_requires_grad )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__kernel__nobag__small__template_8cu.html b/embedding__forward__split__kernel__nobag__small__template_8cu.html new file mode 100644 index 000000000..adccb6fb5 --- /dev/null +++ b/embedding__forward__split__kernel__nobag__small__template_8cu.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_kernel_nobag_small_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_split_kernel_nobag_small_template.cu File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__kernel__template_8cu.html b/embedding__forward__split__kernel__template_8cu.html new file mode 100644 index 000000000..69c6417c5 --- /dev/null +++ b/embedding__forward__split__kernel__template_8cu.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_kernel_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_split_kernel_template.cu File Reference
                    +
                    + + + + + diff --git a/embedding__forward__split__kernel__v2__template_8cu.html b/embedding__forward__split__kernel__v2__template_8cu.html new file mode 100644 index 000000000..a2b84af51 --- /dev/null +++ b/embedding__forward__split__kernel__v2__template_8cu.html @@ -0,0 +1,785 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_kernel_v2_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_split_kernel_v2_template.cu File Reference
                    +
                    +
                    + + + + + + + + +

                    +Classes

                    struct  Vec4Type< float >
                     
                    struct  Vec4Type< at::Half >
                     
                    struct  Vec4Type< uint8_t >
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ ACC_ADD_OR_FMA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define ACC_ADD_OR_FMA( WEIGHT,
                    INDEX_WEIGHT )    {%- if weighted %}
                    +
                    + +
                    +
                    + +

                    ◆ DIV_ROUND_UP

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DIV_ROUND_UP( numer,
                    denom )   ((numer + denom - 1) / denom)
                    +
                    + +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES

                    + +
                    +
                    + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES( ...)
                    +
                    +Value:
                    if (use_lxu_cache) { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(true, __VA_ARGS__); \
                    +
                    } \
                    +
                    else { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(false, __VA_ARGS__); \
                    +
                    }
                    +
                    bool use_lxu_cache
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:746
                    +
                    +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES_HELPER

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES_HELPER( USE_CACHE,
                    KERNEL_TYPE,
                    TAIL_WARP_SIZE,
                    STEP_MASK )
                    +
                    +Value:
                    process_all_indices_## KERNEL_TYPE< \
                    +
                    index_t, \
                    +
                    emb_t, \
                    +
                    emb_vec_t, \
                    +
                    cache_t, \
                    + + +
                    USE_CACHE, \
                    +
                    USE_CACHE && !std::is_same<emb_t, cache_t>::value, \
                    + +
                    STEP, \
                    +
                    STEP_MASK, \
                    +
                    TAIL_WARP_SIZE \
                    +
                    >( \
                    +
                    smem, \
                    +
                    L, \
                    +
                    load_d + (threadIdx.x % TAIL_WARP_SIZE) < load_D, \
                    + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:63
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:62
                    +
                    uint32_t load_D
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:724
                    +
                    const uint32_t params_offset
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:674
                    +
                    vec4_type< output_t > output_vec_t
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:667
                    +
                    constexpr uint32_t NUM_PARAMS
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:671
                    +
                    vec4_type< cache_t > cache_vec_t
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:666
                    +
                    uint32_t L
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:744
                    +
                    const uint32_t load_d
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:741
                    +
                    __shared__ long smem[NUM_PARAMS *NUM_WARPS+kForwardMaxThreads]
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:673
                    +
                    constexpr uint32_t STEP
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:672
                    +
                    constexpr uint32_t NUM_WARPS
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:669
                    +
                    +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_CACHE_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const cache_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_CACHE_WEIGHT_PTR   SMEM_PTR_BASE(const cache_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_EMB_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const emb_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_EMB_WEIGHT_PTR   SMEM_PTR_BASE(const emb_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_GENERIC_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_GENERIC_PTR   SMEM_PTR_BASE(uintptr_t*)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_OFFSET

                    + +
                    +
                    + + + + +
                    #define SMEM_OFFSET    (IS_FULL_WARP ? j : ((threadIdx.x / LOAD_GROUP_SIZE) + (j * NUM_LOAD_GROUPS)))
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_PTR_BASE

                    + +
                    +
                    + + + + + + + +
                    #define SMEM_PTR_BASE( TYPE)    (reinterpret_cast<TYPE>(smem + WEIGHT_PTR_OFFSET) + threadIdx.y * kWarpSize)
                    +
                    + +
                    +
                    + +

                    ◆ WEIGHT_OFFSET

                    + +
                    +
                    + + + + +
                    #define WEIGHT_OFFSET    (IS_FULL_WARP ? threadIdx.x : (threadIdx.x % LOAD_GROUP_SIZE))
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ vec4_type

                    + +
                    +
                    +
                    +template<typename T >
                    + + + + +
                    using vec4_type = typename Vec4Type<T>::type
                    +
                    + +
                    +
                    +

                    Enumeration Type Documentation

                    + +

                    ◆ SAVED_PARAMS

                    + +
                    +
                    + + + + +
                    enum SAVED_PARAMS
                    +
                    + + + + + + + +
                    Enumerator
                    P_indices 
                    P_weights 
                    P_outputs 
                    P_num_offsets 
                    P_load_D 
                    P_total_load_D 
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool USE_LXU_CACHE>
                    + + + + + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ _v2_kernel()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    _v2_kernel (const emb_t *__restrict__ const dev_weights,
                    const emb_t *__restrict__ const uvm_weights,
                    const cache_t *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const index_t *__restrict__ const indices,
                    {%- if weighted %} const float *__restrict__ const index_weights,
                    {%- endif %} const index_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    output_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ fma()

                    + +
                    +
                    + + + + + + + + + + + +
                    accumulator fma (WEIGHT ,
                    INDEX_WEIGHT  )
                    +
                    + +
                    +
                    + +

                    ◆ get_next_bag_boundary_and_L()

                    + +
                    +
                    +
                    +template<uint32_t LOWER_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void get_next_bag_boundary_and_L (const uint32_t bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_large_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_large_Ls (long *const smem,
                    const uint32_t L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_no_pooling()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename output_vec_t , uint32_t STEP>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void process_all_indices_no_pooling (long *const smem,
                    const bool process_d,
                    const uint32_t params_offset )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_small_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_small_Ls (long *const smem,
                    const uint32_t total_L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ write_loop_small_Ls()

                    + +
                    +
                    +
                    +template<typename emb_t , typename output_vec_t , uint32_t STEP, uint32_t BOUNDARY_IDX_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void write_loop_small_Ls (long *const smem,
                    uint32_t *const write_idx,
                    uint32_t *const bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L,
                    Vec4StepT< STEP, emb_t > *const accumulator,
                    const uint32_t params_offset,
                    const uint32_t l,
                    const bool process_d,
                    const bool mean_pooling )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ LXU_PARAMS_CNT

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t LXU_PARAMS_CNT = 2
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ VEC_WIDTH

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t VEC_WIDTH = 4
                    +
                    +constexpr
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__meta__template_8cpp.html b/embedding__forward__split__meta__template_8cpp.html new file mode 100644 index 000000000..0de2d8805 --- /dev/null +++ b/embedding__forward__split__meta__template_8cpp.html @@ -0,0 +1,129 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_meta_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_split_meta_template.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor
                    +
                    +Initial value:
                    {#
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    {%- set ddesc = "dense" if dense else "split" %}
                    +
                    {%- set wdesc = "weighted" if weighted else "unweighted" %}
                    +
                    {%- set vdesc = "_vbe" if vbe else "" %}
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    +
                    using namespace fbgemm_gpu at::Tensor
                    +
                    Definition embedding_ops_placeholder.cpp:15
                    +
                    +
                    +
                    +
                    + + + + diff --git a/embedding__forward__split__template_8cu.html b/embedding__forward__split__template_8cu.html new file mode 100644 index 000000000..b12e4be27 --- /dev/null +++ b/embedding__forward__split__template_8cu.html @@ -0,0 +1,163 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_split_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_forward_split_template.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    {%- if dense %}
                    +
                    Definition fbgemm_tensor_accessor.h:128
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr int kThreadGroupSize = kWarpSize; \
                    + +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    {%- for kEmbeddingSize in [4, 8, 16, 32] %}
                    +
                    +
                    +
                    +
                    + + + + diff --git a/embedding__forward__template__helpers_8cuh.html b/embedding__forward__template__helpers_8cuh.html new file mode 100644 index 000000000..263930ba5 --- /dev/null +++ b/embedding__forward__template__helpers_8cuh.html @@ -0,0 +1,191 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_forward_template_helpers.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_forward_template_helpers.cuh File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/TensorUtils.h>
                    +#include <ATen/core/TensorAccessor.h>
                    +#include <ATen/cuda/CUDAContext.h>
                    +#include <c10/cuda/CUDAGuard.h>
                    +#include <ATen/cuda/Atomic.cuh>
                    +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                    +#include <cub/device/device_radix_sort.cuh>
                    +#include <cub/device/device_run_length_encode.cuh>
                    +#include <cub/device/device_scan.cuh>
                    +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                    +#include <cuda.h>
                    +#include <cuda_runtime.h>
                    +#include <curand_kernel.h>
                    +#include <limits>
                    +#include <mutex>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                    +#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  nbit
                     
                    + + + + +

                    +Functions

                    template<int N>
                    __device__ __forceinline__ void cp_async_wait ()
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ SHFL_SYNC

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SHFL_SYNC( val,
                    srcLane )    shfl_sync(val, srcLane, kThreadGroupSize, shfl_sync_mask)
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ kCacheLocationMissing

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr int32_t kCacheLocationMissing = -1
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ kForwardMaxThreads

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr size_t kForwardMaxThreads = 512
                    +
                    +constexpr
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__inplace__update_8cu.html b/embedding__inplace__update_8cu.html new file mode 100644 index 000000000..8a9fb4a94 --- /dev/null +++ b/embedding__inplace__update_8cu.html @@ -0,0 +1,123 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_inplace_update.cu File Reference
                    +
                    +
                    +
                    #include <cuda.h>
                    +#include <cuda_runtime.h>
                    +#include <c10/cuda/CUDAGuard.h>
                    +#include "fbgemm_gpu/embedding_inplace_update.h"
                    +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    + + + + + +

                    +Functions

                    void embedding_inplace_update_cuda (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_idx, Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, c10::optional< Tensor > lxu_cache_weights=c10::nullopt, c10::optional< Tensor > lxu_cache_locations=c10::nullopt)
                     
                    Tensor pruned_array_lookup_from_row_idx_cuda (const Tensor &update_row_indices, const Tensor &update_table_indices, const Tensor &index_remappings, const Tensor &index_remappings_offsets)
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__inplace__update_8h.html b/embedding__inplace__update_8h.html new file mode 100644 index 000000000..1fd3c3077 --- /dev/null +++ b/embedding__inplace__update_8h.html @@ -0,0 +1,123 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/embedding_inplace_update.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_inplace_update.h File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/TensorAccessor.h>
                    +#include <torch/torch.h>
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    + + + + + +

                    +Functions

                    void embedding_inplace_update_cuda (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_idx, Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, c10::optional< Tensor > lxu_cache_weights=c10::nullopt, c10::optional< Tensor > lxu_cache_locations=c10::nullopt)
                     
                    Tensor pruned_array_lookup_from_row_idx_cuda (const Tensor &update_row_indices, const Tensor &update_table_indices, const Tensor &index_remappings, const Tensor &index_remappings_offsets)
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__inplace__update__cpu_8cpp.html b/embedding__inplace__update__cpu_8cpp.html new file mode 100644 index 000000000..d62289b71 --- /dev/null +++ b/embedding__inplace__update__cpu_8cpp.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_inplace_update_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <algorithm>
                    +#include <cmath>
                    +#include <functional>
                    +#include <ATen/ATen.h>
                    +#include <torch/library.h>
                    +#include "fbgemm_gpu/embedding_inplace_update.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__inplace__update__gpu_8cpp.html b/embedding__inplace__update__gpu_8cpp.html new file mode 100644 index 000000000..02d7feaf4 --- /dev/null +++ b/embedding__inplace__update__gpu_8cpp.html @@ -0,0 +1,114 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_gpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_inplace_update_gpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <ATen/cuda/CUDAContext.h>
                    +#include <torch/library.h>
                    +#include "fbgemm_gpu/embedding_inplace_update.h"
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__inplace__update__test_8cpp.html b/embedding__inplace__update__test_8cpp.html new file mode 100644 index 000000000..7a9cfe080 --- /dev/null +++ b/embedding__inplace__update__test_8cpp.html @@ -0,0 +1,162 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/embedding_inplace_ops/embedding_inplace_update_test.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_inplace_update_test.cpp File Reference
                    +
                    +
                    +
                    #include <folly/Random.h>
                    +#include <gtest/gtest.h>
                    +#include "fbgemm_gpu/embedding_inplace_update.h"
                    +

                    Function Documentation

                    + +

                    ◆ get_D_bytes()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    int32_t get_D_bytes (Tensor D_offsets,
                    Tensor weights_tys,
                    const int32_t table_idx,
                    const int64_t row_alignment )
                    +
                    + +
                    +
                    + +

                    ◆ TEST()

                    + +
                    +
                    + + + + + + + + + + + +
                    TEST (EmbeddingInplaceUpdateTest ,
                    random_update  )
                    +
                    + +
                    +
                    + +

                    ◆ test_embedding_inplace_update()

                    + +
                    +
                    +
                    +template<typename index_t >
                    + + + + + + + +
                    void test_embedding_inplace_update ()
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__op__registration_8h.html b/embedding__op__registration_8h.html new file mode 100644 index 000000000..c69299436 --- /dev/null +++ b/embedding__op__registration_8h.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_op_registration.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_op_registration.h File Reference
                    +
                    +
                    +
                    #include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/library.h>
                    +
                    + + + + diff --git a/embedding__ops__placeholder_8cpp.html b/embedding__ops__placeholder_8cpp.html new file mode 100644 index 000000000..7121fad52 --- /dev/null +++ b/embedding__ops__placeholder_8cpp.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_ops_placeholder.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    embedding_ops_placeholder.cpp File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +
                    + + + + diff --git a/embedding__optimizer__split__device__kernel__template_8cuh.html b/embedding__optimizer__split__device__kernel__template_8cuh.html new file mode 100644 index 000000000..4cc085e05 --- /dev/null +++ b/embedding__optimizer__split__device__kernel__template_8cuh.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_optimizer_split_device_kernel_template.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_optimizer_split_device_kernel_template.cuh File Reference
                    +
                    + + + + + diff --git a/embedding__optimizer__split__host__template_8cpp.html b/embedding__optimizer__split__host__template_8cpp.html new file mode 100644 index 000000000..937d44a4f --- /dev/null +++ b/embedding__optimizer__split__host__template_8cpp.html @@ -0,0 +1,169 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_optimizer_split_host_template.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_optimizer_split_host_template.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ split_embedding_

                    + +
                    +
                    + + + + +
                    void split_embedding_
                    +
                    +Initial value:
                    + + + + + + + +
                    const int64_t max_D,
                    +
                    const bool stochastic_rounding,
                    +
                    {{ args.split_function_args | join(", ") }})
                    +
                    Definition fbgemm_tensor_accessor.h:128
                    +
                    at::Tensor Tensor
                    Definition gen_batch_index_select_dim0_backward_codegen_cuda.cu:15
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:119
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:120
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const lxu_cache_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:58
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const uvm_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:57
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const weights_placements
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:59
                    +
                    template __global__ at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1959
                    +
                    template __global__ at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t max_D
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1967
                    +
                    template __global__ at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool stochastic_rounding
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1968
                    +
                    template __global__ at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1961
                    +
                    +
                    +
                    +
                    + + + + diff --git a/embedding__optimizer__split__kernel__template_8cu.html b/embedding__optimizer__split__kernel__template_8cu.html new file mode 100644 index 000000000..8f57e9a48 --- /dev/null +++ b/embedding__optimizer__split__kernel__template_8cu.html @@ -0,0 +1,179 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_optimizer_split_kernel_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_optimizer_split_kernel_template.cu File Reference
                    +
                    +
                    +
                    #include "gen_embedding_optimizer_{{ optimizer }}_split_device_kernel.cuh"
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ _update_kernel()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    _update_kernel (at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights,
                    at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights,
                    at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices,
                    const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const int32_t max_D,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    {{ args.split_kernel_args|join(", ") }}  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/embedding__optimizer__split__template_8cu.html b/embedding__optimizer__split__template_8cu.html new file mode 100644 index 000000000..0f7c891d5 --- /dev/null +++ b/embedding__optimizer__split__template_8cu.html @@ -0,0 +1,194 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/codegen/embedding_optimizer_split_template.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    embedding_optimizer_split_template.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ _update_kernel()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    _update_kernel (at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights,
                    at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights,
                    at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices,
                    const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const int32_t max_D,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    {{ args.split_kernel_args|join(", ") }}  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/enum__utils_8h.html b/enum__utils_8h.html new file mode 100644 index 000000000..86013b353 --- /dev/null +++ b/enum__utils_8h.html @@ -0,0 +1,260 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/enum_utils.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    enum_utils.h File Reference
                    +
                    +
                    +
                    #include <string>
                    +#include <tuple>
                    +#include <vector>
                    +
                    + + + +

                    +Classes

                    class  enum_registration< T >
                     
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ FBGEMM_GPU_ENUM_CREATE_TAG

                    + +
                    +
                    + + + + + + + +
                    #define FBGEMM_GPU_ENUM_CREATE_TAG( module_name)
                    +
                    +Value:
                    struct fbgemm_gpu_enum_tag_##module_name {}; \
                    +
                    template <> \
                    +
                    enum_registration<struct fbgemm_gpu_enum_tag_##module_name>* \
                    +
                    enum_registration< \
                    +
                    struct fbgemm_gpu_enum_tag_##module_name>::registration_list; \
                    +
                    extern template class enum_registration< \
                    +
                    struct fbgemm_gpu_enum_tag_##module_name>;
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_GLOGAL

                    + +
                    +
                    + + + + + + + +
                    #define FBGEMM_GPU_ENUM_GLOGAL( module_name)
                    +
                    +Value:
                    template class enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>; \
                    +
                    template <> \
                    +
                    enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>* \
                    +
                    enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)>::registration_list = \
                    +
                    nullptr;
                    +
                    #define FBGEMM_GPU_ENUM_TAG(module_name)
                    Definition enum_utils.h:26
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_ITEM

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define FBGEMM_GPU_ENUM_ITEM( x,
                    y,
                    z )    { #x #y, z }
                    +
                    + +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_OP

                    + +
                    +
                    + + + + + + + + + + + +
                    #define FBGEMM_GPU_ENUM_OP( module_name,
                    op_name )
                    +
                    +Value:
                    #op_name "() -> ((str, (str, int)[])[])", \
                    +
                    TORCH_FN(enum_query <FBGEMM_GPU_ENUM_TAG(module_name)>)
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_REGISTER_END

                    + +
                    +
                    + + + + +
                    #define FBGEMM_GPU_ENUM_REGISTER_END   );
                    +
                    + +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_REGISTER_START

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    #define FBGEMM_GPU_ENUM_REGISTER_START( module_name,
                    prefix,
                    enum_name )
                    +
                    +Value:
                    enum_registration<FBGEMM_GPU_ENUM_TAG(module_name)> fbgemm_fpu_enum_reg_ \
                    +
                    ## prefix ## enum_name( #prefix #enum_name,
                    +
                    +
                    +
                    + +

                    ◆ FBGEMM_GPU_ENUM_TAG

                    + +
                    +
                    + + + + + + + +
                    #define FBGEMM_GPU_ENUM_TAG( module_name)    struct fbgemm_gpu_enum_tag_##module_name
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/fbgemm__cuda__utils_8cuh.html b/fbgemm__cuda__utils_8cuh.html new file mode 100644 index 000000000..d9ff35436 --- /dev/null +++ b/fbgemm__cuda__utils_8cuh.html @@ -0,0 +1,249 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    fbgemm_cuda_utils.cuh File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/cuda/CUDAGraphsUtils.cuh>
                    +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                    +#include <cub/block/block_scan.cuh>
                    +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                    +#include <cuda.h>
                    +#include <cuda_bf16.h>
                    +#include <cuda_fp16.h>
                    +#include <cuda_runtime.h>
                    +#include <curand_kernel.h>
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                    +Classes

                    struct  Half4
                     
                    struct  Vec4T< T >
                     
                    struct  Vec4T< float >
                     
                    struct  Vec4T< at::Half >
                     
                    struct  Vec4T< at::BFloat16 >
                     
                    struct  Vec4T< double >
                     
                    struct  Comparator< T >
                     
                    struct  BitonicSort< K, V, Dir, Comp >
                     
                    struct  StochasticRoundingRNGState
                     
                    struct  WeightRow< emb_t, cache_t, dst_t >
                     
                    struct  SharedMemory< int64_t >
                     
                    struct  SharedMemory< int32_t >
                     
                    struct  SharedMemory< float >
                     
                    struct  SharedMemory< double >
                     
                    struct  SharedMemory< Vec4T< at::acc_type< float, true > > >
                     
                    struct  SharedMemory< Vec4T< at::acc_type< double, true > > >
                     
                    struct  VecNT< N, PrimitiveType >
                     
                    struct  VecNT< 1, PrimitiveType::FP >
                     
                    struct  VecNT< 2, PrimitiveType::FP >
                     
                    struct  VecNT< 4, PrimitiveType::FP >
                     
                    struct  VecNT< 4, PrimitiveType::INT >
                     
                    struct  VecNT< 8, PrimitiveType::INT >
                     
                    struct  VecNT< 16, PrimitiveType::INT >
                     
                    struct  Vec4AccT
                     
                    struct  Vec4StepT< STEP, input_t >
                     
                    struct  Vec4StepT< STEP, float >
                     
                    struct  Vec4StepT< STEP, at::Half >
                     
                    struct  Vec4StepT< STEP, uint8_t >
                     
                    class  FixedDivisor
                     
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    + + + + +

                    +Functions

                    template<typename scalar_t , int ITEMS_PER_THREAD, int NUM_THREADS_PER_BLOCK>
                    __inline__ __device__ void inclusive_sum_scan_kernel (scalar_t(&arr)[ITEMS_PER_THREAD], typename cub::BlockScan< scalar_t, NUM_THREADS_PER_BLOCK >::TempStorage &temp_storage, int *block_flags, volatile scalar_t *block_sums, scalar_t *block_prev, const int num_entries_per_block, const int block_id, const bool is_multi_block, const int signal)
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ __HALF2_TO_UI

                    + +
                    +
                    + + + + + + + +
                    #define __HALF2_TO_UI( var)   *(reinterpret_cast<unsigned int*>(&(var)))
                    +
                    + +
                    +
                    + +

                    ◆ DEVICE_INLINE

                    + +
                    +
                    + + + + +
                    #define DEVICE_INLINE   __device__ inline __attribute__((always_inline))
                    +
                    + +
                    +
                    + +

                    ◆ max

                    + +
                    +
                    + + + + + + + + + + + +
                    #define max( a,
                    b )   ((a) > (b) ? (a) : (b))
                    +
                    + +
                    +
                    + +

                    ◆ min

                    + +
                    +
                    + + + + + + + + + + + +
                    #define min( a,
                    b )   ((a) < (b) ? (a) : (b))
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/fbgemm__tensor__accessor_8h.html b/fbgemm__tensor__accessor_8h.html new file mode 100644 index 000000000..48c47ec92 --- /dev/null +++ b/fbgemm__tensor__accessor_8h.html @@ -0,0 +1,339 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/fbgemm_tensor_accessor.h File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    fbgemm_tensor_accessor.h File Reference
                    +
                    +
                    +
                    #include <c10/macros/Macros.h>
                    +#include <c10/util/ArrayRef.h>
                    +#include <c10/util/Deprecated.h>
                    +#include <c10/util/Exception.h>
                    +#include <c10/util/irange.h>
                    +#include <cstddef>
                    +#include <cstdint>
                    +
                    + + + + + + + + + + + + + + + +

                    +Classes

                    struct  DefaultPtrTraits< T >
                     
                    class  TensorAccessorBase< T, N, PtrTraits, index_t >
                     
                    class  TensorAccessor< T, N, PtrTraits, index_t >
                     
                    class  TensorAccessor< T, 1, PtrTraits, index_t >
                     
                    class  GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >
                     
                    class  GenericPackedTensorAccessor< T, N, PtrTraits, index_t >
                     
                    class  GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >
                     
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ AT_X

                    + +
                    +
                    + + + + +
                    #define AT_X   GenericPackedTensorAccessor<T, N, PtrTraits, index_t>
                    +
                    + +
                    +
                    + +

                    ◆ MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( FUNC_NAME,
                    TENSOR,
                    T,
                    N,
                    PTR_TRAITS,
                    INDEX_NBITS )
                    +
                    +Value:
                    make_packed_tensor_accessor##INDEX_NBITS< \
                    +
                    at::acc_type<T, true>, \
                    +
                    N, \
                    +
                    PTR_TRAITS>(TENSOR)
                    +
                    +
                    +
                    + +

                    ◆ MAKE_PACKED_TENSOR_ACCESSOR_BASE

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define MAKE_PACKED_TENSOR_ACCESSOR_BASE( FUNC_NAME,
                    TENSOR,
                    T,
                    N,
                    PTR_TRAITS,
                    INDEX_NBITS )    make_packed_tensor_accessor##INDEX_NBITS<T, N, PTR_TRAITS>(TENSOR)
                    +
                    + +
                    +
                    + +

                    ◆ MAKE_PTA_ACC_WITH_NAME

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define MAKE_PTA_ACC_WITH_NAME( FUNC_NAME,
                    TENSOR,
                    T,
                    N,
                    INDEX_NBITS )
                    +
                    +Value:
                    +
                    FUNC_NAME, TENSOR, T, N, at::RestrictPtrTraits, INDEX_NBITS)
                    +
                    #define MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE( FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS)
                    Definition fbgemm_tensor_accessor.h:569
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t T
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:61
                    +
                    +
                    +
                    + +

                    ◆ MAKE_PTA_WITH_NAME

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define MAKE_PTA_WITH_NAME( FUNC_NAME,
                    TENSOR,
                    T,
                    N,
                    INDEX_NBITS )
                    +
                    +Value:
                    +
                    FUNC_NAME, TENSOR, T, N, at::RestrictPtrTraits, INDEX_NBITS)
                    +
                    #define MAKE_PACKED_TENSOR_ACCESSOR_BASE( FUNC_NAME, TENSOR, T, N, PTR_TRAITS, INDEX_NBITS)
                    Definition fbgemm_tensor_accessor.h:565
                    +
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ make_packed_tensor_accessor32()

                    + +
                    +
                    +
                    +template<typename T , size_t N, template< typename U > class PtrTraits = at::DefaultPtrTraits>
                    + + + + + + + +
                    pta::PackedTensorAccessor32< T, N, PtrTraits > make_packed_tensor_accessor32 (const at::Tensor & tensor)
                    +
                    + +
                    +
                    + +

                    ◆ make_packed_tensor_accessor64()

                    + +
                    +
                    +
                    +template<typename T , size_t N, template< typename U > class PtrTraits = at::DefaultPtrTraits>
                    + + + + + + + +
                    pta::PackedTensorAccessor64< T, N, PtrTraits > make_packed_tensor_accessor64 (const at::Tensor & tensor)
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/files.html b/files.html new file mode 100644 index 000000000..bad18bb54 --- /dev/null +++ b/files.html @@ -0,0 +1,551 @@ + + + + + + + +fbgemm_gpu: File List + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    File List
                    +
                    +
                    +
                    Here is a list of all files with brief descriptions:
                    +
                    [detail level 1234567]
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                      _skbuild
                      linux-x86_64-3.12
                      bench
                     verify_fp16_stochastic_benchmark.cu
                      codegen
                     batch_index_select_dim0_cpu_host.cpp
                     batch_index_select_dim0_host.cpp
                     embedding_backward_dense_host.cpp
                     embedding_backward_dense_host_cpu.cpp
                     embedding_backward_split_cpu_approx_template.cpp
                     embedding_backward_split_cpu_template.cpp
                     embedding_backward_split_grad_template.cu
                     embedding_backward_split_host_cpu_template.cpp
                     embedding_backward_split_host_template.cpp
                     embedding_backward_split_indice_weights_template.cu
                     embedding_backward_split_kernel_cta_template.cu
                     embedding_backward_split_kernel_warp_template.cu
                     embedding_backward_split_template.cu
                     embedding_bounds_check.cu
                     embedding_bounds_check_host.cpp
                     embedding_bounds_check_host_cpu.cpp
                     embedding_forward_quantized_cpu_template.cpp
                     embedding_forward_quantized_host.cpp
                     embedding_forward_quantized_host_cpu.cpp
                     embedding_forward_quantized_split_lookup.cu
                     embedding_forward_quantized_split_nbit_host_template.cu
                     embedding_forward_quantized_split_nbit_kernel_template.cu
                     embedding_forward_split_cpu.cpp
                     embedding_forward_split_cpu.h
                     embedding_forward_split_kernel_nobag_small_template.cu
                     embedding_forward_split_kernel_template.cu
                     embedding_forward_split_kernel_v2_template.cu
                     embedding_forward_split_meta_template.cpp
                     embedding_forward_split_template.cu
                     embedding_forward_template_helpers.cuh
                     embedding_op_registration.h
                     embedding_ops_placeholder.cpp
                     embedding_optimizer_split_device_kernel_template.cuh
                     embedding_optimizer_split_host_template.cpp
                     embedding_optimizer_split_kernel_template.cu
                     embedding_optimizer_split_template.cu
                      include
                      fbgemm_gpu
                      src
                      embedding_inplace_ops
                      input_combine_ops
                      jagged_tensor_ops
                      layout_transform_ops
                      memory_utils
                      merge_pooled_embedding_ops
                      metric_ops
                      permute_pooled_embedding_ops
                      quantize_ops
                      sparse_ops
                      split_embeddings_cache
                      split_embeddings_utils
                      ssd_split_embeddings_cache
                     histogram_binning_calibration_ops.cu
                     topology_utils.cpp
                      test
                     cpu_kernel_test.cpp
                     sparse_ops_utils_test.cpp
                     tensor_assert_test.cpp
                     uvm_cache_miss_emulate_test.cpp
                    +
                    +
                    + + + + diff --git a/functions.html b/functions.html new file mode 100644 index 000000000..1b4444fc2 --- /dev/null +++ b/functions.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_b.html b/functions_b.html new file mode 100644 index 000000000..9c38e33d8 --- /dev/null +++ b/functions_b.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - b -

                    +
                    + + + + diff --git a/functions_c.html b/functions_c.html new file mode 100644 index 000000000..2ef6430c3 --- /dev/null +++ b/functions_c.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - c -

                    +
                    + + + + diff --git a/functions_d.html b/functions_d.html new file mode 100644 index 000000000..9a0dea2b4 --- /dev/null +++ b/functions_d.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_e.html b/functions_e.html new file mode 100644 index 000000000..93eea805b --- /dev/null +++ b/functions_e.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - e -

                    +
                    + + + + diff --git a/functions_eval.html b/functions_eval.html new file mode 100644 index 000000000..069461afe --- /dev/null +++ b/functions_eval.html @@ -0,0 +1,83 @@ + + + + + + + +fbgemm_gpu: Class Members - Enumerator + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all enum values with links to the classes they belong to:
                    +
                    + + + + diff --git a/functions_f.html b/functions_f.html new file mode 100644 index 000000000..68d2dadc1 --- /dev/null +++ b/functions_f.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func.html b/functions_func.html new file mode 100644 index 000000000..89cd04a3a --- /dev/null +++ b/functions_func.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_b.html b/functions_func_b.html new file mode 100644 index 000000000..6a9a86efd --- /dev/null +++ b/functions_func_b.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - b -

                    +
                    + + + + diff --git a/functions_func_c.html b/functions_func_c.html new file mode 100644 index 000000000..d6a97c289 --- /dev/null +++ b/functions_func_c.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - c -

                    +
                    + + + + diff --git a/functions_func_d.html b/functions_func_d.html new file mode 100644 index 000000000..b10bea37f --- /dev/null +++ b/functions_func_d.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - d -

                    +
                    + + + + diff --git a/functions_func_e.html b/functions_func_e.html new file mode 100644 index 000000000..6ca5393f2 --- /dev/null +++ b/functions_func_e.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - e -

                    +
                    + + + + diff --git a/functions_func_f.html b/functions_func_f.html new file mode 100644 index 000000000..93f4f547a --- /dev/null +++ b/functions_func_f.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_g.html b/functions_func_g.html new file mode 100644 index 000000000..495ebb191 --- /dev/null +++ b/functions_func_g.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_i.html b/functions_func_i.html new file mode 100644 index 000000000..2c4501ae8 --- /dev/null +++ b/functions_func_i.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_l.html b/functions_func_l.html new file mode 100644 index 000000000..b1bb90a5a --- /dev/null +++ b/functions_func_l.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_m.html b/functions_func_m.html new file mode 100644 index 000000000..967ed58ff --- /dev/null +++ b/functions_func_m.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_o.html b/functions_func_o.html new file mode 100644 index 000000000..a7b5aa62e --- /dev/null +++ b/functions_func_o.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_r.html b/functions_func_r.html new file mode 100644 index 000000000..2bbb7af07 --- /dev/null +++ b/functions_func_r.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - r -

                    +
                    + + + + diff --git a/functions_func_s.html b/functions_func_s.html new file mode 100644 index 000000000..1bb5cb06a --- /dev/null +++ b/functions_func_s.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_t.html b/functions_func_t.html new file mode 100644 index 000000000..cd770a189 --- /dev/null +++ b/functions_func_t.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - t -

                    +
                    + + + + diff --git a/functions_func_v.html b/functions_func_v.html new file mode 100644 index 000000000..67e180223 --- /dev/null +++ b/functions_func_v.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_func_w.html b/functions_func_w.html new file mode 100644 index 000000000..1257896d6 --- /dev/null +++ b/functions_func_w.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - w -

                    +
                    + + + + diff --git a/functions_func_~.html b/functions_func_~.html new file mode 100644 index 000000000..c21b00478 --- /dev/null +++ b/functions_func_~.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Class Members - Functions + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all functions with links to the classes they belong to:
                    + +

                    - ~ -

                    +
                    + + + + diff --git a/functions_g.html b/functions_g.html new file mode 100644 index 000000000..9eba548ca --- /dev/null +++ b/functions_g.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_h.html b/functions_h.html new file mode 100644 index 000000000..cf09084eb --- /dev/null +++ b/functions_h.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - h -

                    +
                    + + + + diff --git a/functions_i.html b/functions_i.html new file mode 100644 index 000000000..b5cb377fb --- /dev/null +++ b/functions_i.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_l.html b/functions_l.html new file mode 100644 index 000000000..33bbcb366 --- /dev/null +++ b/functions_l.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_m.html b/functions_m.html new file mode 100644 index 000000000..26a197f37 --- /dev/null +++ b/functions_m.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_n.html b/functions_n.html new file mode 100644 index 000000000..9c786a340 --- /dev/null +++ b/functions_n.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - n -

                    +
                    + + + + diff --git a/functions_o.html b/functions_o.html new file mode 100644 index 000000000..f6fc4c774 --- /dev/null +++ b/functions_o.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_p.html b/functions_p.html new file mode 100644 index 000000000..119cbcb7b --- /dev/null +++ b/functions_p.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_r.html b/functions_r.html new file mode 100644 index 000000000..d0f82dbf7 --- /dev/null +++ b/functions_r.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - r -

                    +
                    + + + + diff --git a/functions_s.html b/functions_s.html new file mode 100644 index 000000000..059156c7c --- /dev/null +++ b/functions_s.html @@ -0,0 +1,100 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_t.html b/functions_t.html new file mode 100644 index 000000000..0c2470f07 --- /dev/null +++ b/functions_t.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_type.html b/functions_type.html new file mode 100644 index 000000000..3033e7105 --- /dev/null +++ b/functions_type.html @@ -0,0 +1,84 @@ + + + + + + + +fbgemm_gpu: Class Members - Typedefs + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_v.html b/functions_v.html new file mode 100644 index 000000000..2c1af44d5 --- /dev/null +++ b/functions_v.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + + + + + diff --git a/functions_vars.html b/functions_vars.html new file mode 100644 index 000000000..ddb010a49 --- /dev/null +++ b/functions_vars.html @@ -0,0 +1,178 @@ + + + + + + + +fbgemm_gpu: Class Members - Variables + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all variables with links to the classes they belong to:
                    + +

                    - a -

                    + + +

                    - b -

                    + + +

                    - c -

                    + + +

                    - d -

                    + + +

                    - f -

                    + + +

                    - g -

                    + + +

                    - h -

                    + + +

                    - i -

                    + + +

                    - l -

                    + + +

                    - n -

                    + + +

                    - p -

                    + + +

                    - r -

                    + + +

                    - s -

                    + + +

                    - v -

                    + + +

                    - w -

                    + + +

                    - x -

                    +
                    + + + + diff --git a/functions_w.html b/functions_w.html new file mode 100644 index 000000000..e29f34d27 --- /dev/null +++ b/functions_w.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - w -

                    +
                    + + + + diff --git a/functions_x.html b/functions_x.html new file mode 100644 index 000000000..bfe3b8049 --- /dev/null +++ b/functions_x.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - x -

                    +
                    + + + + diff --git a/functions_~.html b/functions_~.html new file mode 100644 index 000000000..79ab911ee --- /dev/null +++ b/functions_~.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Class Members + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Here is a list of all class members with links to the classes they belong to:
                    + +

                    - ~ -

                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html b/gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html new file mode 100644 index 000000000..ad7e47add --- /dev/null +++ b/gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html @@ -0,0 +1,210 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_backward_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_backward_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ batch_index_select_dim0_codegen_backward_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor batch_index_select_dim0_codegen_backward_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const int64_t max_segment_length_per_warp,
                    const Tensor & grad_offsets,
                    const Tensor & total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const int32_t num_warps_per_feature,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__backward__kernel__cta_8cu.html b/gen__batch__index__select__dim0__backward__kernel__cta_8cu.html new file mode 100644 index 000000000..3c0f345ee --- /dev/null +++ b/gen__batch__index__select__dim0__backward__kernel__cta_8cu.html @@ -0,0 +1,965 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_backward_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_backward_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_cta_per_row< at template __global__ __launch_bounds__(kMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_cta_per_row< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__backward__kernel__warp_8cu.html b/gen__batch__index__select__dim0__backward__kernel__warp_8cu.html new file mode 100644 index 000000000..9abb18bbc --- /dev/null +++ b/gen__batch__index__select__dim0__backward__kernel__warp_8cu.html @@ -0,0 +1,461 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_backward_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_backward_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row< at template __global__ __launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html b/gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html new file mode 100644 index 000000000..49e918f49 --- /dev/null +++ b/gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html @@ -0,0 +1,261 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_forward_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_forward_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ batch_index_select_dim0_codegen_forward_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor batch_index_select_dim0_codegen_forward_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const int64_t output_dtype,
                    const Tensor & output_offsets,
                    const Tensor & total_L_offsets,
                    const int64_t output_size,
                    const int32_t fixed_L_per_warp,
                    const int32_t num_warps_per_feature,
                    const bool permute_output_dim_0_1 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__forward__kernel_8cu.html b/gen__batch__index__select__dim0__forward__kernel_8cu.html new file mode 100644 index 000000000..62ae31cfc --- /dev/null +++ b/gen__batch__index__select__dim0__forward__kernel_8cu.html @@ -0,0 +1,549 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_forward_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_forward_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets,
                    const int32_t fixed_L_per_warp,
                    const bool permute_output_dim_0_1,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__batch__index__select__dim0__forward__kernel__small_8cu.html b/gen__batch__index__select__dim0__forward__kernel__small_8cu.html new file mode 100644 index 000000000..26111d922 --- /dev/null +++ b/gen__batch__index__select__dim0__forward__kernel__small_8cu.html @@ -0,0 +1,323 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_batch_index_select_dim0_forward_kernel_small.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_batch_index_select_dim0_forward_kernel_small.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ dev_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights
                    +
                    + +
                    +
                    + +

                    ◆ fd_B

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor fd_B
                    +
                    + +
                    +
                    + +

                    ◆ fixed_L_per_warp

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t fixed_L_per_warp
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output
                    +
                    + +
                    +
                    + +

                    ◆ output_offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets
                    +
                    + +
                    +
                    + +

                    ◆ permute_output_dim_0_1

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool permute_output_dim_0_1
                    +
                    + +
                    +
                    + +

                    ◆ total_L_offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__cpu_8cpp.html b/gen__embedding__backward__adagrad__split__cpu_8cpp.html new file mode 100644 index 000000000..43e4a3dbb --- /dev/null +++ b/gen__embedding__backward__adagrad__split__cpu_8cpp.html @@ -0,0 +1,252 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_adagrad_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..e5f11dbe0 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html @@ -0,0 +1,297 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..a5d8efa2f --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..7dd84e282 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..ed7ca8567 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,287 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..a51823052 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..e92adad27 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1206 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html b/gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..c60d273e7 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html @@ -0,0 +1,302 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..553b84956 --- /dev/null +++ b/gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..e577c3f3b --- /dev/null +++ b/gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1326 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__cuda_8cu.html b/gen__embedding__backward__adam__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..6751abb16 --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__cuda_8cu.html @@ -0,0 +1,343 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..9bc1a0620 --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1806 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..8f0cdd6d6 --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..d15f1f60b --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,333 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..c1550d0de --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1726 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..d18270f64 --- /dev/null +++ b/gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__weighted__cuda_8cu.html b/gen__embedding__backward__adam__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..198b8b870 --- /dev/null +++ b/gen__embedding__backward__adam__split__weighted__cuda_8cu.html @@ -0,0 +1,348 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adam_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adam_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..b2c0379b9 --- /dev/null +++ b/gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1846 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..a72590df8 --- /dev/null +++ b/gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1646 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_adam_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_adam_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..6cc389dcf --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html @@ -0,0 +1,307 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..2afec6f17 --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1566 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..a699e98d9 --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..45c0c2405 --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,297 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..1bf6d600a --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..725e01903 --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..d76213cba --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html @@ -0,0 +1,312 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..2caf2a091 --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..42c14ae0a --- /dev/null +++ b/gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html b/gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html new file mode 100644 index 000000000..cc943eacb --- /dev/null +++ b/gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html @@ -0,0 +1,196 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_indice_weights_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_indice_weights_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__cpu_8cpp.html b/gen__embedding__backward__dense__split__cpu_8cpp.html new file mode 100644 index 000000000..c9d225761 --- /dev/null +++ b/gen__embedding__backward__dense__split__cpu_8cpp.html @@ -0,0 +1,217 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_dense_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_dense_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    double unused = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__cuda_8cu.html b/gen__embedding__backward__dense__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..32c32e792 --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__cuda_8cu.html @@ -0,0 +1,227 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_dense_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    double unused )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..1893c7903 --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1005 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..041c43b6a --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,481 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..0684bf103 --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,217 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    double unused )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..d43aec63f --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,925 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..8227a0bd3 --- /dev/null +++ b/gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,441 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__weighted__cuda_8cu.html b/gen__embedding__backward__dense__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..0e0eb9db0 --- /dev/null +++ b/gen__embedding__backward__dense__split__weighted__cuda_8cu.html @@ -0,0 +1,232 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_dense_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_dense_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    double unused )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..346eed047 --- /dev/null +++ b/gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1045 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..d7d92900a --- /dev/null +++ b/gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,845 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_dense_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_dense_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float unused )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html b/gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..7a90c6b0d --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html @@ -0,0 +1,343 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..4e180036a --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1806 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..5df9dedb2 --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..102a1ab4b --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,333 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..382252f56 --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1726 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..c374fa666 --- /dev/null +++ b/gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__weighted__cuda_8cu.html b/gen__embedding__backward__lamb__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..caba003e1 --- /dev/null +++ b/gen__embedding__backward__lamb__split__weighted__cuda_8cu.html @@ -0,0 +1,348 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lamb_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lamb_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..320f39543 --- /dev/null +++ b/gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1846 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..48c4893db --- /dev/null +++ b/gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1646 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lamb_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lamb_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..dd02fee55 --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html @@ -0,0 +1,313 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate,
                    double eta,
                    double momentum,
                    double weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..ec72101fe --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1566 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..3ff78496f --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..d26a6ec10 --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,303 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate,
                    double eta,
                    double momentum,
                    double weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..810e247ae --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..57f116893 --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html b/gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..105935b95 --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html @@ -0,0 +1,318 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate,
                    double eta,
                    double momentum,
                    double weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..403d0734f --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..dc9794c0d --- /dev/null +++ b/gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float learning_rate,
                    float eta,
                    float momentum,
                    float weight_decay )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__cuda_8cu.html b/gen__embedding__backward__none__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..9f53ccc5d --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__cuda_8cu.html @@ -0,0 +1,272 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_none_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_none_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..251286437 --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1165 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..bc98cf9d0 --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,561 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..d3674f644 --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,262 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..6e77503d3 --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1085 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..571f34906 --- /dev/null +++ b/gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,521 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/4]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__weighted__cuda_8cu.html b/gen__embedding__backward__none__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..492c761c1 --- /dev/null +++ b/gen__embedding__backward__none__split__weighted__cuda_8cu.html @@ -0,0 +1,277 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_none_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_none_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..4f70c3110 --- /dev/null +++ b/gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1205 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..c890b3786 --- /dev/null +++ b/gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1005 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_none_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_none_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights,
                    const int32_t max_D,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    int64_t total_hash_size,
                    int64_t total_unique_indices )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..00bf2d8ec --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html @@ -0,0 +1,337 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..a52d47c2a --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1806 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..d4ecf8ce2 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..d34a21324 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,327 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..d95d288dc --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1726 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..b93dc0507 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..3d91fcc9b --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html @@ -0,0 +1,342 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..80c6786ac --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1846 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..02c808573 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1646 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..07ec14712 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html @@ -0,0 +1,337 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..049733cba --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1806 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..a861a770b --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..2814f8526 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,327 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..e9330acb8 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1726 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..6b2eadfa1 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..49556318e --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html @@ -0,0 +1,342 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate,
                    double eps,
                    double beta1,
                    double beta2,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..11065c374 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1846 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..0ba7b5939 --- /dev/null +++ b/gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1646 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets,
                    float learning_rate,
                    float eps,
                    float beta1,
                    float beta2,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html b/gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html new file mode 100644 index 000000000..490ad0a82 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html @@ -0,0 +1,267 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_rowwise_adagrad_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..c9ce0e4ba --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html @@ -0,0 +1,312 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode,
                    double max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..986688a4f --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..63de39288 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..e058e5964 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,302 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode,
                    double max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..996466965 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..7c0e2d12d --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1326 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html new file mode 100644 index 000000000..f170f5a9d --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html @@ -0,0 +1,327 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode,
                    double max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html new file mode 100644 index 000000000..38ddbe93d --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html @@ -0,0 +1,1686 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html new file mode 100644 index 000000000..07d8d8c35 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..72e7fda41 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html @@ -0,0 +1,317 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode,
                    double max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..7b4919278 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1646 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..df1e2d55b --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1446 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html new file mode 100644 index 000000000..98d366a28 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html @@ -0,0 +1,332 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode,
                    double max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html new file mode 100644 index 000000000..67e7d5109 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html @@ -0,0 +1,1726 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html new file mode 100644 index 000000000..b66e0eeb5 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html @@ -0,0 +1,1526 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode,
                    float max_norm )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html new file mode 100644 index 000000000..cd41d5d6c --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html @@ -0,0 +1,352 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_host,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_host,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..765938402 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html @@ -0,0 +1,407 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    double adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    double max_counter,
                    double tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    double weight_norm_coefficient,
                    double lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..bc525f8f8 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,2366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..87e16a753 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,2166 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..9449a6d9f --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,397 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    double adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    double max_counter,
                    double tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    double weight_norm_coefficient,
                    double lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..9c86d32fb --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,2286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..1fe7d515d --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,2086 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..f5b86f9fc --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html @@ -0,0 +1,412 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    double adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    double max_counter,
                    double tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    double weight_norm_coefficient,
                    double lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..67066a1cc --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,2406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..7db75f89c --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,2206 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter,
                    int64_t counter_halflife,
                    int64_t adjustment_iter,
                    float adjustment_ub,
                    int64_t learning_rate_mode,
                    int64_t weight_decay_mode,
                    int64_t grad_sum_decay,
                    float max_counter,
                    float tail_id_threshold,
                    int64_t is_tail_id_thresh_ratio,
                    int64_t regularization_mode,
                    float weight_norm_coefficient,
                    float lower_bound )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..d745457c5 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html @@ -0,0 +1,307 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..8fb8a4367 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1566 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..d1127777d --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..9b2de2249 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,297 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..21b41a03c --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..e1257c86f --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..6c9ec43e9 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html @@ -0,0 +1,312 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..f8a136945 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..67a49dc28 --- /dev/null +++ b/gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t weight_decay_mode )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html new file mode 100644 index 000000000..bb0b6b5f0 --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html @@ -0,0 +1,262 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..5ce603f9b --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html @@ -0,0 +1,313 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..15fc8ea0f --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1566 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..ae5c22069 --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..7f9294ffb --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,303 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..09d9071cb --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1486 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..0057c1f74 --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..302e84a0f --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html @@ -0,0 +1,318 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    constexpr auto kMaxVecsPerThread = 8; \
                    +
                    constexpr auto kThreadGroupSize = kWarpSize; \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    template __global__ kWarpSize
                    Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps,
                    double learning_rate,
                    double weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..eeb4b9712 --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1606 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..afa91175d --- /dev/null +++ b/gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets,
                    float eps,
                    float learning_rate,
                    float weight_decay,
                    int64_t iter )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__cpu_8cpp.html b/gen__embedding__backward__sgd__split__cpu_8cpp.html new file mode 100644 index 000000000..3fca536c0 --- /dev/null +++ b/gen__embedding__backward__sgd__split__cpu_8cpp.html @@ -0,0 +1,232 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_backward_sgd_split_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <map>
                    +#include <tuple>
                    +#include <utility>
                    +#include <ATen/ATen.h>
                    +#include <ATen/AccumulateType.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm/Types.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  internal
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_sgd_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    double learning_rate = 0,
                    int64_t output_dtype = static_cast<int64_t>(SparseType::FP32) )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html b/gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html new file mode 100644 index 000000000..fcafcb7c1 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html @@ -0,0 +1,272 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html b/gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html new file mode 100644 index 000000000..e117ecd50 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html @@ -0,0 +1,1286 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html b/gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html new file mode 100644 index 000000000..2fd79adac --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html @@ -0,0 +1,1086 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html b/gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html new file mode 100644 index 000000000..305babdf1 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html @@ -0,0 +1,262 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html b/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html new file mode 100644 index 000000000..16eb98897 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html @@ -0,0 +1,1206 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html b/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html new file mode 100644 index 000000000..a96a1a2de --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html @@ -0,0 +1,1006 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html b/gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html new file mode 100644 index 000000000..3f9c2a5a1 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html @@ -0,0 +1,287 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html b/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html new file mode 100644 index 000000000..ebae6f0f5 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html @@ -0,0 +1,1366 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html b/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html new file mode 100644 index 000000000..c15bae171 --- /dev/null +++ b/gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html @@ -0,0 +1,1166 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__cuda_8cu.html b/gen__embedding__backward__sgd__split__weighted__cuda_8cu.html new file mode 100644 index 000000000..5d86d13b5 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__cuda_8cu.html @@ -0,0 +1,277 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html b/gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html new file mode 100644 index 000000000..006855563 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html @@ -0,0 +1,1326 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html b/gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html new file mode 100644 index 000000000..15171fac9 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html @@ -0,0 +1,1126 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html b/gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html new file mode 100644 index 000000000..a4f147187 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html @@ -0,0 +1,292 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_OPTIMAL_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_KERNEL( MAX_D,
                    ... )
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t unused_,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html b/gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html new file mode 100644 index 000000000..241517d96 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html @@ -0,0 +1,1406 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1< at template __global__ __launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter,
                    const int32_t max_segment_length_per_cta,
                    const bool use_deterministic_algorithms,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html b/gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html new file mode 100644 index 000000000..4a419ee29 --- /dev/null +++ b/gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html @@ -0,0 +1,1206 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kBackwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kBackwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1< at template __global__ __launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1< at template __global__ kWarpSize (const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/8]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ kWarpSize (const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations,
                    const bool use_uniq_cache_locations,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets,
                    const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    int32_t max_segment_length_per_warp,
                    bool stochastic_rounding,
                    at::PhiloxCudaState stochastic_rounding_philox_args,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    float learning_rate )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__adagrad_8cpp.html b/gen__embedding__backward__split__adagrad_8cpp.html new file mode 100644 index 000000000..f1d020aca --- /dev/null +++ b/gen__embedding__backward__split__adagrad_8cpp.html @@ -0,0 +1,889 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_adagrad.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_adagrad.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__adagrad__cpu_8cpp.html b/gen__embedding__backward__split__adagrad__cpu_8cpp.html new file mode 100644 index 000000000..294847e5c --- /dev/null +++ b/gen__embedding__backward__split__adagrad__cpu_8cpp.html @@ -0,0 +1,215 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_adagrad_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_adagrad_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    int64_t output_dtype = static_cast< int64_t >(SparseType::FP32) )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__adam_8cpp.html b/gen__embedding__backward__split__adam_8cpp.html new file mode 100644 index 000000000..9fd0ab8c0 --- /dev/null +++ b/gen__embedding__backward__split__adam_8cpp.html @@ -0,0 +1,1009 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_adam.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_adam.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_adam_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_adam_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__adam__cpu_8cpp.html b/gen__embedding__backward__split__adam__cpu_8cpp.html new file mode 100644 index 000000000..3700d2bb6 --- /dev/null +++ b/gen__embedding__backward__split__adam__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_adam_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_adam_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html new file mode 100644 index 000000000..b1194c620 --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html @@ -0,0 +1,152 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html new file mode 100644 index 000000000..f502509a8 --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html new file mode 100644 index 000000000..94efd4eca --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html @@ -0,0 +1,152 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html new file mode 100644 index 000000000..5495137b6 --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html new file mode 100644 index 000000000..b02dce432 --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html @@ -0,0 +1,919 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html b/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html new file mode 100644 index 000000000..4711b7106 --- /dev/null +++ b/gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__sgd_8cpp.html b/gen__embedding__backward__split__approx__sgd_8cpp.html new file mode 100644 index 000000000..12a05ed04 --- /dev/null +++ b/gen__embedding__backward__split__approx__sgd_8cpp.html @@ -0,0 +1,152 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_sgd.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_sgd.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__approx__sgd__cpu_8cpp.html b/gen__embedding__backward__split__approx__sgd__cpu_8cpp.html new file mode 100644 index 000000000..e2c052fbe --- /dev/null +++ b/gen__embedding__backward__split__approx__sgd__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_approx_sgd_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_approx_sgd_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__grad_8cu.html b/gen__embedding__backward__split__grad_8cu.html new file mode 100644 index 000000000..0c88cb9ea --- /dev/null +++ b/gen__embedding__backward__split__grad_8cu.html @@ -0,0 +1,291 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_grad.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_grad.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/4]

                    + +
                    +
                    +
                    +template<typename grad_t >
                    + + + + + + + +
                    template __global__ __launch_bounds__(kMaxThreads) void grad_mean_vbe_kernel< at template __global__ __launch_bounds__(kMaxThreads) void grad_mean_vbe_kernel< float >(pta __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/4]

                    + +
                    +
                    +
                    +template<typename grad_t >
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [3/4]

                    + +
                    +
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [4/4]

                    + +
                    +
                    +
                    +template<typename info_pta_t , typename info_t , bool nobag>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ false()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ false (const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > dev_or_uvm_unique_indices,
                    const int info_B_num_bits )
                    +
                    + +
                    +
                    + +

                    ◆ true()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __global__ true (const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > dev_or_uvm_unique_indices,
                    const int info_B_num_bits )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template __global__ int64_t
                    +
                    + +
                    +
                    + +

                    ◆ uint32_t

                    + +
                    +
                    + + + + +
                    template __global__ uint32_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html b/gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html new file mode 100644 index 000000000..fee7d36fa --- /dev/null +++ b/gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html @@ -0,0 +1,307 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_indice_weights_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_indice_weights_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename grad_t , typename cache_t , size_t kMaxVecsPerThread>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64 )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__lamb_8cpp.html b/gen__embedding__backward__split__lamb_8cpp.html new file mode 100644 index 000000000..4b67da080 --- /dev/null +++ b/gen__embedding__backward__split__lamb_8cpp.html @@ -0,0 +1,1009 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_lamb.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_lamb.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_lamb_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lamb_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__lamb__cpu_8cpp.html b/gen__embedding__backward__split__lamb__cpu_8cpp.html new file mode 100644 index 000000000..499320c0d --- /dev/null +++ b/gen__embedding__backward__split__lamb__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_lamb_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_lamb_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__lars__sgd_8cpp.html b/gen__embedding__backward__split__lars__sgd_8cpp.html new file mode 100644 index 000000000..42b5e38be --- /dev/null +++ b/gen__embedding__backward__split__lars__sgd_8cpp.html @@ -0,0 +1,919 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_lars_sgd.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_lars_sgd.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate = 0,
                    double eta = 0,
                    double momentum = 0,
                    double weight_decay = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate = 0,
                    double eta = 0,
                    double momentum = 0,
                    double weight_decay = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate = 0,
                    double eta = 0,
                    double momentum = 0,
                    double weight_decay = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__lars__sgd__cpu_8cpp.html b/gen__embedding__backward__split__lars__sgd__cpu_8cpp.html new file mode 100644 index 000000000..af399a615 --- /dev/null +++ b/gen__embedding__backward__split__lars__sgd__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_lars_sgd_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_lars_sgd_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__none_8cpp.html b/gen__embedding__backward__split__none_8cpp.html new file mode 100644 index 000000000..55baa9179 --- /dev/null +++ b/gen__embedding__backward__split__none_8cpp.html @@ -0,0 +1,814 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_none.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_none.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_none_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_none_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size = 0,
                    int64_t total_unique_indices = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_none_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_none_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size = 0,
                    int64_t total_unique_indices = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    int64_t total_hash_size = 0,
                    int64_t total_unique_indices = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__none__cpu_8cpp.html b/gen__embedding__backward__split__none__cpu_8cpp.html new file mode 100644 index 000000000..00c6b83df --- /dev/null +++ b/gen__embedding__backward__split__none__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_none_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_none_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__partial__rowwise__adam_8cpp.html b/gen__embedding__backward__split__partial__rowwise__adam_8cpp.html new file mode 100644 index 000000000..87b95470a --- /dev/null +++ b/gen__embedding__backward__split__partial__rowwise__adam_8cpp.html @@ -0,0 +1,1009 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_partial_rowwise_adam.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_partial_rowwise_adam.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html b/gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html new file mode 100644 index 000000000..a3d567acf --- /dev/null +++ b/gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html b/gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html new file mode 100644 index 000000000..ddd1b3613 --- /dev/null +++ b/gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html @@ -0,0 +1,1009 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_partial_rowwise_lamb.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_partial_rowwise_lamb.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html b/gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html new file mode 100644 index 000000000..f1c4af1b9 --- /dev/null +++ b/gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad_8cpp.html new file mode 100644 index 000000000..6da0c1709 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad_8cpp.html @@ -0,0 +1,1599 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html new file mode 100644 index 000000000..0dde9fdf1 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html @@ -0,0 +1,230 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0,
                    int64_t output_dtype = static_cast< int64_t >(SparseType::FP32) )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html new file mode 100644 index 000000000..e39e00097 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html @@ -0,0 +1,1219 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html new file mode 100644 index 000000000..28ec12387 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html @@ -0,0 +1,315 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_host,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_host,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0,
                    int64_t output_dtype = static_cast< int64_t >(SparseType::FP32) )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html new file mode 100644 index 000000000..10d5f289a --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html @@ -0,0 +1,919 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html b/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html new file mode 100644 index 000000000..3493e73d6 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html b/gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html new file mode 100644 index 000000000..43ee48ad8 --- /dev/null +++ b/gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html @@ -0,0 +1,919 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_weighted_adagrad.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_weighted_adagrad.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html b/gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html new file mode 100644 index 000000000..414f5facd --- /dev/null +++ b/gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html @@ -0,0 +1,225 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    Tensor momentum1_host,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    int64_t output_dtype = static_cast< int64_t >(SparseType::FP32) )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__sgd_8cpp.html b/gen__embedding__backward__split__sgd_8cpp.html new file mode 100644 index 000000000..71854cd92 --- /dev/null +++ b/gen__embedding__backward__split__sgd_8cpp.html @@ -0,0 +1,1399 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_sgd.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_sgd.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_sgd_weighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_weighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const Tensor & B_offsets,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_grad_indice_weights_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_grad_indice_weights_vbe_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const Tensor & feature_requires_grad,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda (const Tensor & grad_output,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t BT_block_size,
                    const int64_t max_segment_length_per_warp,
                    const bool stochastic_rounding,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool use_uniq_cache_locations,
                    const bool use_homogeneous_placements,
                    double learning_rate = 0 )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [1/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fb ,
                    m  )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__backward__split__sgd__cpu_8cpp.html b/gen__embedding__backward__split__sgd__cpu_8cpp.html new file mode 100644 index 000000000..0790fca63 --- /dev/null +++ b/gen__embedding__backward__split__sgd__cpu_8cpp.html @@ -0,0 +1,195 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_backward_split_sgd_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_backward_split_sgd_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "codegen/embedding_forward_split_cpu.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_backward_codegen_sgd_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_backward_codegen_sgd_cpu (Tensor grad_output,
                    Tensor host_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor D_offsets,
                    int64_t max_D,
                    Tensor hash_size_cumsum,
                    int64_t total_hash_size_bits,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    Tensor indice_weights,
                    bool stochastic_rounding,
                    double learning_rate = 0,
                    int64_t output_dtype = static_cast< int64_t >(SparseType::FP32) )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html b/gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..c48d7f78a --- /dev/null +++ b/gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html @@ -0,0 +1,319 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_unweighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_unweighted_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ dense_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html b/gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html new file mode 100644 index 000000000..e827fbfe7 --- /dev/null +++ b/gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html @@ -0,0 +1,234 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_unweighted_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_unweighted_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ dense_embedding_codegen_forward_unweighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_unweighted_meta (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ dense_embedding_nobag_codegen_forward_unweighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_nobag_codegen_forward_unweighted_meta (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__unweighted__kernel_8cu.html b/gen__embedding__forward__dense__unweighted__kernel_8cu.html new file mode 100644 index 000000000..4ba2e419d --- /dev/null +++ b/gen__embedding__forward__dense__unweighted__kernel_8cu.html @@ -0,0 +1,489 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_unweighted_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_unweighted_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html b/gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html new file mode 100644 index 000000000..d51d4f9fd --- /dev/null +++ b/gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html @@ -0,0 +1,459 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_unweighted_nobag_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_unweighted_nobag_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html b/gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html new file mode 100644 index 000000000..64a1fcf43 --- /dev/null +++ b/gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html @@ -0,0 +1,281 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ D

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t D
                    +
                    + +
                    +
                    + +

                    ◆ dev_weights

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> dev_weights
                    +
                    + +
                    +
                    + +

                    ◆ fd_B

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor fd_B
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> indices
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> pta::PackedTensorAccessor64<float, 2, at::RestrictPtrTraits> output
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> weights_offsets
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html b/gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..09e64245f --- /dev/null +++ b/gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html @@ -0,0 +1,278 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_weighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_weighted_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    return __VA_ARGS__(); \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ dense_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html b/gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html new file mode 100644 index 000000000..710f79b8b --- /dev/null +++ b/gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html @@ -0,0 +1,193 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_weighted_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_weighted_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ dense_embedding_codegen_forward_weighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor dense_embedding_codegen_forward_weighted_meta (const Tensor & dev_weights,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__dense__weighted__kernel_8cu.html b/gen__embedding__forward__dense__weighted__kernel_8cu.html new file mode 100644 index 000000000..ead2f9f58 --- /dev/null +++ b/gen__embedding__forward__dense__weighted__kernel_8cu.html @@ -0,0 +1,855 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_dense_weighted_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_dense_weighted_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..ee14e0668 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html @@ -0,0 +1,648 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ X [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT2_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    template uint8_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1240
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:119
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:120
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:121
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const lxu_cache_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:58
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const uvm_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:57
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t *__restrict__ const const index_t *__restrict__ const const uint32_t *__restrict__ const const int64_t *__restrict__ const const int32_t *__restrict__ const lxu_cache_locations
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:69
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const weights_placements
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:59
                    +
                    indices_is_long &[is_long_idx] is_long_mask int32_t
                    Definition input_combine.cu:73
                    +
                    __host__ DEVICE_INLINE int32_t div_round_up(int32_t a, int32_t b)
                    Definition fbgemm_cuda_utils.cuh:1460
                    +
                    __global__ const int32_t const int32_t T
                    Definition sparse_batched_unary_embeddings.cu:21
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ indices
                    Definition sparse_batched_unary_embeddings.cu:26
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ offsets
                    Definition sparse_batched_unary_embeddings.cu:25
                    +
                    __global__ const int32_t B
                    Definition sparse_batched_unary_embeddings.cu:20
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ output
                    Definition sparse_batched_unary_embeddings.cu:28
                    +
                    C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t div_round_up(uint32_t a, uint32_t b)
                    Definition embedding_common.h:94
                    +
                    +
                    +
                    + +

                    ◆ X [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT4_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT8_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP8_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    fp8_exponent_bits, \
                    +
                    fp8_exponent_bias, \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP16_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP32_split_embedding_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ Y

                    + +
                    +
                    + + + + + + + +
                    #define Y( ...)
                    +
                    +Value:
                    if (device_only) { \
                    +
                    X(true, __VA_ARGS__) \
                    +
                    } else { \
                    +
                    X(false, __VA_ARGS__) \
                    +
                    };
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_unweighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    const int64_t total_D,
                    const int64_t max_int2_D,
                    const int64_t max_int4_D,
                    const int64_t max_int8_D,
                    const int64_t max_float16_D,
                    const int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    const int64_t pooling_mode,
                    const int64_t row_alignment,
                    const int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    const int64_t max_float8_D,
                    const int64_t fp8_exponent_bits,
                    const int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html new file mode 100644 index 000000000..b31e971d7 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html @@ -0,0 +1,626 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ X [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT2_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    template uint8_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1240
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:119
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:120
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t D
                    Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:101
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const lxu_cache_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:58
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const uvm_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:57
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t *__restrict__ const const index_t *__restrict__ const const uint32_t *__restrict__ const const int64_t *__restrict__ const const int32_t *__restrict__ const lxu_cache_locations
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:69
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const weights_placements
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:59
                    +
                    indices_is_long &[is_long_idx] is_long_mask int32_t
                    Definition input_combine.cu:73
                    +
                    __host__ DEVICE_INLINE int32_t div_round_up(int32_t a, int32_t b)
                    Definition fbgemm_cuda_utils.cuh:1460
                    +
                    __global__ const int32_t const int32_t T
                    Definition sparse_batched_unary_embeddings.cu:21
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ indices
                    Definition sparse_batched_unary_embeddings.cu:26
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ offsets
                    Definition sparse_batched_unary_embeddings.cu:25
                    +
                    __global__ const int32_t B
                    Definition sparse_batched_unary_embeddings.cu:20
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ output
                    Definition sparse_batched_unary_embeddings.cu:28
                    +
                    C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t div_round_up(uint32_t a, uint32_t b)
                    Definition embedding_common.h:94
                    +
                    +
                    +
                    + +

                    ◆ X [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT4_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT8_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP8_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    fp8_exponent_bits, \
                    +
                    fp8_exponent_bias, \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP16_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP32_split_embedding_nobag_codegen_forward_unweighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    D, \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ Y

                    + +
                    +
                    + + + + + + + +
                    #define Y( ...)
                    +
                    +Value:
                    if (device_only) { \
                    +
                    X(true, __VA_ARGS__) \
                    +
                    } else { \
                    +
                    X(false, __VA_ARGS__) \
                    +
                    };
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    const int64_t D,
                    const int64_t max_int2_D,
                    const int64_t max_int4_D,
                    const int64_t max_int8_D,
                    const int64_t max_float16_D,
                    const int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    const int64_t row_alignment,
                    const int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    const int64_t max_float8_D,
                    const int64_t fp8_exponent_bits,
                    const int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..440fd23e4 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html @@ -0,0 +1,653 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ X [1/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT2_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, indice_weights, float, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name1, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    template uint8_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1240
                    +
                    template int64_t
                    Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:119
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:120
                    +
                    template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets
                    Definition gen_batch_index_select_dim0_forward_kernel_small.cu:121
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const lxu_cache_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:58
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const uvm_weights
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:57
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t *__restrict__ const const index_t *__restrict__ const const uint32_t *__restrict__ const const int64_t *__restrict__ const const int32_t *__restrict__ const lxu_cache_locations
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:69
                    +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const weights_placements
                    Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:59
                    +
                    indices_is_long &[is_long_idx] is_long_mask int32_t
                    Definition input_combine.cu:73
                    +
                    __host__ DEVICE_INLINE int32_t div_round_up(int32_t a, int32_t b)
                    Definition fbgemm_cuda_utils.cuh:1460
                    +
                    __global__ const int32_t const int32_t T
                    Definition sparse_batched_unary_embeddings.cu:21
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ indices
                    Definition sparse_batched_unary_embeddings.cu:26
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ offsets
                    Definition sparse_batched_unary_embeddings.cu:25
                    +
                    __global__ const int32_t B
                    Definition sparse_batched_unary_embeddings.cu:20
                    +
                    __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ output
                    Definition sparse_batched_unary_embeddings.cu:28
                    +
                    C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t div_round_up(uint32_t a, uint32_t b)
                    Definition embedding_common.h:94
                    +
                    +
                    +
                    + +

                    ◆ X [2/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT4_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, indice_weights, float, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name2, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [3/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::INT8_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, indice_weights, float, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name3, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [4/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP8_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, indice_weights, float, 1, 32), \
                    +
                    fp8_exponent_bits, \
                    +
                    fp8_exponent_bias, \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name4, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [5/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP16_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, indice_weights, float, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name5, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ X [6/6]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    #define X( DeviceOnly,
                    OutputRowsPerThread,
                    InputRowsInFlight,
                    MinNum128BRows,
                    MaxNum128BRows )
                    +
                    +Value:
                    nbit::FP32_split_embedding_codegen_forward_weighted_kernel_small_L<index_t, output_t, OutputRowsPerThread, kWarpsPerBlock, InputRowsInFlight, MinNum128BRows, MaxNum128BRows, DeviceOnly><<< \
                    +
                    nbit::div_round_up(T * nbit::div_round_up(B, OutputRowsPerThread), kWarpsPerBlock), \
                    +
                    dim3(kWarpSize, kWarpsPerBlock), \
                    +
                    0, \
                    +
                    at::cuda::getCurrentCUDAStream()>>>( \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, dev_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, uvm_weights, uint8_t, 1, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_placements, int32_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_offsets, int64_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, weights_tys, uint8_t, 1, 32), \
                    +
                    \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, D_offsets, int32_t, 1, 32), \
                    +
                    \
                    +
                    FixedDivisor(div_round_up(B, OutputRowsPerThread)), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, indices, index_t, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, offsets, index_t, 1, 32), \
                    +
                    \
                    +
                    pooling_mode, \
                    +
                    \
                    +
                    row_alignment, \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, indice_weights, float, 1, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, output, output_t, 2, 32), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_weights, uint8_t, 2, 64), \
                    +
                    MAKE_PTA_WITH_NAME(func_name6, lxu_cache_locations, int32_t, 1, 32) \
                    +
                    ); \
                    +
                    C10_CUDA_KERNEL_LAUNCH_CHECK(); \
                    +
                    +
                    +
                    + +

                    ◆ Y

                    + +
                    +
                    + + + + + + + +
                    #define Y( ...)
                    +
                    +Value:
                    if (device_only) { \
                    +
                    X(true, __VA_ARGS__) \
                    +
                    } else { \
                    +
                    X(false, __VA_ARGS__) \
                    +
                    };
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_weighted_cuda (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    const int64_t total_D,
                    const int64_t max_int2_D,
                    const int64_t max_int4_D,
                    const int64_t max_int8_D,
                    const int64_t max_float16_D,
                    const int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    const int64_t pooling_mode,
                    const int64_t row_alignment,
                    Tensor indice_weights,
                    const int64_t output_dtype,
                    Tensor lxu_cache_weights,
                    Tensor lxu_cache_locations,
                    const int64_t max_float8_D,
                    const int64_t fp8_exponent_bits,
                    const int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html new file mode 100644 index 000000000..11e6121ec --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html new file mode 100644 index 000000000..6855f5186 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html new file mode 100644 index 000000000..0b91c2e18 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html new file mode 100644 index 000000000..98529bc4d --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html new file mode 100644 index 000000000..9842befa9 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html new file mode 100644 index 000000000..bbb165d52 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html new file mode 100644 index 000000000..4a068bf37 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html new file mode 100644 index 000000000..02e9e3f1e --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html new file mode 100644 index 000000000..0a367b90d --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html new file mode 100644 index 000000000..841149cae --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html new file mode 100644 index 000000000..cc37d4507 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html new file mode 100644 index 000000000..89b3ebaa4 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html new file mode 100644 index 000000000..f9351a858 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html new file mode 100644 index 000000000..b7c043047 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html new file mode 100644 index 000000000..a0d8c4249 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html new file mode 100644 index 000000000..374aceddf --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html new file mode 100644 index 000000000..c14c44c06 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html new file mode 100644 index 000000000..2ae1430d3 --- /dev/null +++ b/gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu File Reference
                    +
                    +
                    + + + + +

                    +Namespaces

                    namespace  nbit
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html b/gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html new file mode 100644 index 000000000..c2c0b2ee6 --- /dev/null +++ b/gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html @@ -0,0 +1,254 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/Context.h>
                    +#include <ATen/Parallel.h>
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include <cstring>
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_unweighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    const int64_t D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html b/gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html new file mode 100644 index 000000000..84e5028ee --- /dev/null +++ b/gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html @@ -0,0 +1,250 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_quantized_weighted_codegen_cpu.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_quantized_weighted_codegen_cpu.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/Context.h>
                    +#include <ATen/Parallel.h>
                    +#include "fbgemm_gpu/cpu_utils.h"
                    +#include "fbgemm_gpu/dispatch_macros.h"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +#include "fbgemm/FbgemmEmbedding.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include <cstring>
                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_forward_weighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_forward_weighted_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    int64_t row_alignment,
                    Tensor indice_weights,
                    int64_t output_dtype,
                    int64_t fp8_exponent_bits,
                    int64_t fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_hashmap_insert_weighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void pruned_hashmap_insert_weighted_cpu (Tensor indices,
                    Tensor dense_indices,
                    Tensor offsets,
                    Tensor hash_table,
                    Tensor hash_table_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_hashmap_lookup_weighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    Tensor pruned_hashmap_lookup_weighted_cpu (Tensor indices,
                    Tensor offsets,
                    Tensor hash_table,
                    Tensor hash_table_offsets )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html b/gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..0fbdc7eb0 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html @@ -0,0 +1,588 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (CACHE_CASE_ == false) { \
                    +
                    constexpr auto _TUseCache = false; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (CACHE_CASE_ == true) { \
                    +
                    constexpr auto _TUseCache = true; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool use_lxu_cache>
                    + + + + + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html b/gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html new file mode 100644 index 000000000..0f59ba9c8 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html @@ -0,0 +1,274 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_unweighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_meta (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_nobag_codegen_forward_unweighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_nobag_codegen_forward_unweighted_meta (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__kernel_8cu.html b/gen__embedding__forward__split__unweighted__kernel_8cu.html new file mode 100644 index 000000000..7eeaa4072 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__kernel_8cu.html @@ -0,0 +1,1063 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , bool use_lxu_cache, typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ false

                    + +
                    +
                    + + + + +
                    template false
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ true

                    + +
                    +
                    + + + + +
                    template true
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html b/gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html new file mode 100644 index 000000000..fb3979f62 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html @@ -0,0 +1,1003 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_nobag_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_nobag_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , bool use_lxu_cache, typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    int64_t D,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ false

                    + +
                    +
                    + + + + +
                    template false
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ true

                    + +
                    +
                    + + + + +
                    template true
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html b/gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html new file mode 100644 index 000000000..93eedfc1d --- /dev/null +++ b/gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html @@ -0,0 +1,337 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_nobag_kernel_small.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_nobag_kernel_small.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ D

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t D
                    +
                    + +
                    +
                    + +

                    ◆ dev_weights

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> dev_weights
                    +
                    + +
                    +
                    + +

                    ◆ fd_B

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor fd_B
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> indices
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> int64_t FixedDivisor const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> pta::PackedTensorAccessor64<float, 2, at::RestrictPtrTraits> output
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    template const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> const pta::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const pta::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__v2__kernel_8cu.html b/gen__embedding__forward__split__unweighted__v2__kernel_8cu.html new file mode 100644 index 000000000..9bd71c18d --- /dev/null +++ b/gen__embedding__forward__split__unweighted__v2__kernel_8cu.html @@ -0,0 +1,3605 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_v2_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_split_unweighted_v2_kernel.cu File Reference
                    +
                    +
                    + + + + + + + + + + +

                    +Classes

                    struct  Vec4Type< T >
                     
                    struct  Vec4Type< float >
                     
                    struct  Vec4Type< at::Half >
                     
                    struct  Vec4Type< uint8_t >
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ ACC_ADD_OR_FMA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define ACC_ADD_OR_FMA( WEIGHT,
                    INDEX_WEIGHT )    accumulator.add(WEIGHT);
                    +
                    + +
                    +
                    + +

                    ◆ DIV_ROUND_UP

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DIV_ROUND_UP( numer,
                    denom )   ((numer + denom - 1) / denom)
                    +
                    + +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES

                    + +
                    +
                    + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES( ...)
                    +
                    +Value:
                    if (use_lxu_cache) { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(true, __VA_ARGS__); \
                    +
                    } \
                    +
                    else { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(false, __VA_ARGS__); \
                    +
                    }
                    +
                    bool use_lxu_cache
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:746
                    +
                    +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES_HELPER

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES_HELPER( USE_CACHE,
                    KERNEL_TYPE,
                    TAIL_WARP_SIZE,
                    STEP_MASK )
                    +
                    +Value:
                    process_all_indices_## KERNEL_TYPE< \
                    +
                    index_t, \
                    +
                    emb_t, \
                    +
                    emb_vec_t, \
                    +
                    cache_t, \
                    + + +
                    USE_CACHE, \
                    +
                    USE_CACHE && !std::is_same<emb_t, cache_t>::value, \
                    + +
                    STEP, \
                    +
                    STEP_MASK, \
                    +
                    TAIL_WARP_SIZE \
                    +
                    >( \
                    +
                    smem, \
                    +
                    L, \
                    +
                    load_d + (threadIdx.x % TAIL_WARP_SIZE) < load_D, \
                    + + + +
                    uint32_t load_D
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:724
                    +
                    const uint32_t params_offset
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:674
                    +
                    const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:656
                    +
                    vec4_type< output_t > output_vec_t
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:667
                    +
                    constexpr uint32_t NUM_PARAMS
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:671
                    +
                    const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:657
                    +
                    vec4_type< cache_t > cache_vec_t
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:666
                    +
                    uint32_t L
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:744
                    +
                    const uint32_t load_d
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:741
                    +
                    __shared__ long smem[NUM_PARAMS *NUM_WARPS+kForwardMaxThreads]
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:673
                    +
                    constexpr uint32_t STEP
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:672
                    +
                    constexpr uint32_t NUM_WARPS
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:669
                    +
                    +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_CACHE_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const cache_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_CACHE_WEIGHT_PTR   SMEM_PTR_BASE(const cache_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_EMB_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const emb_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_EMB_WEIGHT_PTR   SMEM_PTR_BASE(const emb_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_GENERIC_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_GENERIC_PTR   SMEM_PTR_BASE(uintptr_t*)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_OFFSET

                    + +
                    +
                    + + + + +
                    #define SMEM_OFFSET    (IS_FULL_WARP ? j : ((threadIdx.x / LOAD_GROUP_SIZE) + (j * NUM_LOAD_GROUPS)))
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_PTR_BASE

                    + +
                    +
                    + + + + + + + +
                    #define SMEM_PTR_BASE( TYPE)    (reinterpret_cast<TYPE>(smem + WEIGHT_PTR_OFFSET) + threadIdx.y * kWarpSize)
                    +
                    + +
                    +
                    + +

                    ◆ WEIGHT_OFFSET

                    + +
                    +
                    + + + + +
                    #define WEIGHT_OFFSET    (IS_FULL_WARP ? threadIdx.x : (threadIdx.x % LOAD_GROUP_SIZE))
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ cache_vec_t

                    + +
                    +
                    + + + + +
                    using cache_vec_t = vec4_type<cache_t>
                    +
                    + +
                    +
                    + +

                    ◆ output_vec_t

                    + +
                    +
                    + + + + +
                    using output_vec_t = vec4_type<output_t>
                    +
                    + +
                    +
                    + +

                    ◆ vec4_type

                    + +
                    +
                    +
                    +template<typename T >
                    + + + + +
                    using vec4_type = typename Vec4Type<T>::type
                    +
                    + +
                    +
                    +

                    Enumeration Type Documentation

                    + +

                    ◆ LXU_CACHE_PARAMS

                    + +
                    +
                    + + + + +
                    enum LXU_CACHE_PARAMS
                    +
                    + + + +
                    Enumerator
                    P_lxu_cache_weights 
                    P_lxu_cache_locations 
                    + +
                    +
                    + +

                    ◆ SAVED_PARAMS

                    + +
                    +
                    + + + + +
                    enum SAVED_PARAMS
                    +
                    + + + + + + + + +
                    Enumerator
                    P_indices 
                    P_weights 
                    P_outputs 
                    P_offsets 
                    P_num_offsets 
                    P_load_D 
                    P_total_load_D 
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool USE_LXU_CACHE>
                    + + + + + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ DivMod()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    fd_num_warps_per_table DivMod (global_warp_id ,
                    & t,
                    & table_warp_id )
                    +
                    + +
                    +
                    + +

                    ◆ false() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ get_next_bag_boundary_and_L()

                    + +
                    +
                    +
                    +template<uint32_t LOWER_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void get_next_bag_boundary_and_L (const uint32_t bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [1/8]

                    + +
                    +
                    + + + + + + + +
                    if (is_small_L &&table_warp_id >=num_warps_for_small_L * 8)
                    +
                    + +
                    +
                    + +

                    ◆ if() [2/8]

                    + +
                    +
                    + + + + + + + +
                    if (is_small_L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [3/8]

                    + +
                    +
                    + + + + + + + +
                    if (is_zero_total_L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [4/8]

                    + +
                    +
                    + + + + + + + +
                    if (L<= 1)
                    +
                    + +
                    +
                    + +

                    ◆ if() [5/8]

                    + +
                    +
                    + + + + + + + +
                    if ()
                    +
                    + +
                    +
                    + +

                    ◆ if() [6/8]

                    + +
                    +
                    + + + + + + + +
                    if (t >= T)
                    +
                    + +
                    +
                    + +

                    ◆ if() [7/8]

                    + +
                    +
                    + + + + + + + +
                    if (table_warp_id >=num_warps_per_row * is_small_L ? num_warps_for_small_L :B)
                    +
                    + +
                    +
                    + +

                    ◆ if() [8/8]

                    + +
                    +
                    + + + + + + + +
                    if (threadIdx. x = = 0)
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_large_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_large_Ls (long *const smem,
                    const uint32_t L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_no_pooling()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename output_vec_t , uint32_t STEP>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void process_all_indices_no_pooling (long *const smem,
                    const bool process_d,
                    const uint32_t params_offset )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_small_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_small_Ls (long *const smem,
                    const uint32_t total_L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ true() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< at template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ write_loop_small_Ls()

                    + +
                    +
                    +
                    +template<typename emb_t , typename output_vec_t , uint32_t STEP, uint32_t BOUNDARY_IDX_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void write_loop_small_Ls (long *const smem,
                    uint32_t *const write_idx,
                    uint32_t *const bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L,
                    Vec4StepT< STEP, emb_t > *const accumulator,
                    const uint32_t params_offset,
                    const uint32_t l,
                    const bool process_d,
                    const bool mean_pooling )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ b

                    + +
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ D_start

                    + +
                    +
                    + + + + +
                    uint32_t D_start
                    +
                    + +
                    +
                    + +

                    ◆ else

                    + +
                    +
                    + + + + +
                    else
                    +
                    +Initial value:
                    {
                    +
                    +
                    }
                    +
                    +
                    +
                    +
                    +
                    +
                    }
                    +
                    +
                    +
                    + +
                    __global__ void split_embedding_codegen_forward_unweighted_v2_kernel
                    +
                    <
                    + +
                    constexpr size_t kForwardMaxThreads
                    Definition embedding_forward_template_helpers.cuh:43
                    +
                    uint8_t
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:904
                    +
                    __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights
                    +
                    +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ global_warp_id

                    + +
                    +
                    + + + + +
                    const int32_t global_warp_id = blockIdx.x * blockDim.y + threadIdx.y
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ is_small_L

                    + +
                    +
                    + + + + +
                    const auto is_small_L = total_L <= (static_cast<index_t>(B) * 8)
                    +
                    + +
                    +
                    + +

                    ◆ is_zero_total_L

                    + +
                    +
                    + + + + +
                    const auto is_zero_total_L = total_L == 0
                    +
                    + +
                    +
                    + +

                    ◆ L

                    + +
                    +
                    + + + + +
                    L = shfl_sync(L, 0)
                    +
                    + +
                    +
                    + +

                    ◆ load_D

                    + +
                    +
                    + + + + +
                    load_D = shfl_sync(load_D, 0)
                    +
                    + +
                    +
                    + +

                    ◆ load_d

                    + +
                    +
                    + + + + +
                    const uint32_t load_d = (table_warp_id % num_warps_per_row) * kWarpSize
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ LXU_PARAMS_CNT

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t LXU_PARAMS_CNT = 2
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ NUM_OFFSETS_PER_WARP

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_OFFSETS_PER_WARP = kWarpSize
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ NUM_PARAMS

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_PARAMS = SAVED_PARAMS_CNT + (USE_LXU_CACHE ? LXU_PARAMS_CNT : 0)
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ NUM_WARPS

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_WARPS = kForwardMaxThreads / kWarpSize
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ num_warps_for_small_L

                    + +
                    +
                    + + + + +
                    const uint32_t num_warps_for_small_L = DIV_ROUND_UP(B, NUM_OFFSETS_PER_WARP)
                    +
                    + +
                    +
                    + +

                    ◆ num_warps_per_row

                    + +
                    +
                    + + + + +
                    const uint32_t num_warps_per_row = DIV_ROUND_UP(load_D, kWarpSize)
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    +Initial value:
                    {
                    +
                    using emb_vec_t = vec4_type<emb_t>
                    +
                    typename Vec4Type< T >::type vec4_type
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:69
                    +
                    +
                    +
                    + +

                    ◆ params_offset

                    + +
                    +
                    + + + + +
                    const uint32_t params_offset = NUM_PARAMS * threadIdx.y
                    +
                    + +
                    +
                    + +

                    ◆ row_start

                    + +
                    +
                    + + + + +
                    uint32_t row_start
                    +
                    + +
                    +
                    + +

                    ◆ SAVED_PARAMS_CNT

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t SAVED_PARAMS_CNT = 7
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ smem

                    + +
                    +
                    + + + + +
                    __shared__ long smem[NUM_PARAMS *NUM_WARPS+kForwardMaxThreads]
                    +
                    + +
                    +
                    + +

                    ◆ STEP

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t STEP = 4
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ t

                    + +
                    +
                    + + + + +
                    int32_t t
                    +
                    + +
                    +
                    + +

                    ◆ table_warp_id

                    + +
                    +
                    + + + + +
                    int32_t table_warp_id
                    +
                    + +
                    +
                    + +

                    ◆ total_L

                    + +
                    +
                    + + + + +
                    const auto total_L = offsets[(t + 1) * B] - offsets[t * B]
                    +
                    + +
                    +
                    + +

                    ◆ total_load_D

                    + +
                    +
                    + + + + +
                    uint32_t total_load_D
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    + +

                    ◆ use_lxu_cache

                    + +
                    +
                    + + + + +
                    use_lxu_cache = USE_LXU_CACHE
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ VEC_WIDTH

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t VEC_WIDTH = 4
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html b/gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html new file mode 100644 index 000000000..43a2c367b --- /dev/null +++ b/gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html @@ -0,0 +1,546 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (CACHE_CASE_ == false) { \
                    +
                    constexpr auto _TUseCache = false; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (CACHE_CASE_ == true) { \
                    +
                    constexpr auto _TUseCache = true; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool use_lxu_cache>
                    + + + + + + + + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_unweighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html b/gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html new file mode 100644 index 000000000..345008c20 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html @@ -0,0 +1,233 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_unweighted_vbe_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_unweighted_vbe_meta (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html b/gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html new file mode 100644 index 000000000..8c11cb718 --- /dev/null +++ b/gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html @@ -0,0 +1,1243 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_unweighted_vbe_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_unweighted_vbe_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , bool use_lxu_cache, typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ false

                    + +
                    +
                    + + + + +
                    template false
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ true

                    + +
                    +
                    + + + + +
                    template true
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__codegen__cuda_8cu.html b/gen__embedding__forward__split__weighted__codegen__cuda_8cu.html new file mode 100644 index 000000000..284423188 --- /dev/null +++ b/gen__embedding__forward__split__weighted__codegen__cuda_8cu.html @@ -0,0 +1,541 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (CACHE_CASE_ == false) { \
                    +
                    constexpr auto _TUseCache = false; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (CACHE_CASE_ == true) { \
                    +
                    constexpr auto _TUseCache = true; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool use_lxu_cache>
                    + + + + + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ index_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const index_weights
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__codegen__meta_8cpp.html b/gen__embedding__forward__split__weighted__codegen__meta_8cpp.html new file mode 100644 index 000000000..fe8611265 --- /dev/null +++ b/gen__embedding__forward__split__weighted__codegen__meta_8cpp.html @@ -0,0 +1,213 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_weighted_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_meta (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__kernel_8cu.html b/gen__embedding__forward__split__weighted__kernel_8cu.html new file mode 100644 index 000000000..07ee41ff0 --- /dev/null +++ b/gen__embedding__forward__split__weighted__kernel_8cu.html @@ -0,0 +1,1123 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , bool use_lxu_cache, typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    FixedDivisor fd_B,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ false

                    + +
                    +
                    + + + + +
                    template false
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ true

                    + +
                    +
                    + + + + +
                    template true
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__v2__kernel_8cu.html b/gen__embedding__forward__split__weighted__v2__kernel_8cu.html new file mode 100644 index 000000000..64bb21550 --- /dev/null +++ b/gen__embedding__forward__split__weighted__v2__kernel_8cu.html @@ -0,0 +1,3690 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_v2_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    gen_embedding_forward_split_weighted_v2_kernel.cu File Reference
                    +
                    +
                    + + + + + + + + + + +

                    +Classes

                    struct  Vec4Type< T >
                     
                    struct  Vec4Type< float >
                     
                    struct  Vec4Type< at::Half >
                     
                    struct  Vec4Type< uint8_t >
                     
                    +

                    Macro Definition Documentation

                    + +

                    ◆ ACC_ADD_OR_FMA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define ACC_ADD_OR_FMA( WEIGHT,
                    INDEX_WEIGHT )    accumulator.fma(WEIGHT, INDEX_WEIGHT);
                    +
                    + +
                    +
                    + +

                    ◆ DIV_ROUND_UP

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DIV_ROUND_UP( numer,
                    denom )   ((numer + denom - 1) / denom)
                    +
                    + +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES

                    + +
                    +
                    + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES( ...)
                    +
                    +Value:
                    if (use_lxu_cache) { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(true, __VA_ARGS__); \
                    +
                    } \
                    +
                    else { \
                    +
                    INVOKE_PROCESS_ALL_INDICES_HELPER(false, __VA_ARGS__); \
                    +
                    }
                    +
                    bool use_lxu_cache
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:765
                    +
                    +
                    +
                    + +

                    ◆ INVOKE_PROCESS_ALL_INDICES_HELPER

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    #define INVOKE_PROCESS_ALL_INDICES_HELPER( USE_CACHE,
                    KERNEL_TYPE,
                    TAIL_WARP_SIZE,
                    STEP_MASK )
                    +
                    +Value:
                    process_all_indices_## KERNEL_TYPE< \
                    +
                    index_t, \
                    +
                    emb_t, \
                    +
                    emb_vec_t, \
                    +
                    cache_t, \
                    + + +
                    USE_CACHE, \
                    +
                    USE_CACHE && !std::is_same<emb_t, cache_t>::value, \
                    + +
                    STEP, \
                    +
                    STEP_MASK, \
                    +
                    TAIL_WARP_SIZE \
                    +
                    >( \
                    +
                    smem, \
                    +
                    L, \
                    +
                    load_d + (threadIdx.x % TAIL_WARP_SIZE) < load_D, \
                    + + + +
                    uint32_t load_D
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:743
                    +
                    const uint32_t params_offset
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:693
                    +
                    const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:674
                    +
                    vec4_type< output_t > output_vec_t
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:686
                    +
                    constexpr uint32_t NUM_PARAMS
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:690
                    +
                    const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:675
                    +
                    vec4_type< cache_t > cache_vec_t
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:685
                    +
                    uint32_t L
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:763
                    +
                    const uint32_t load_d
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:760
                    +
                    __shared__ long smem[NUM_PARAMS *NUM_WARPS+kForwardMaxThreads]
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:692
                    +
                    constexpr uint32_t STEP
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:691
                    +
                    constexpr uint32_t NUM_WARPS
                    Definition gen_embedding_forward_split_weighted_v2_kernel.cu:688
                    +
                    +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_CACHE_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const cache_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_CACHE_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_CACHE_WEIGHT_PTR   SMEM_PTR_BASE(const cache_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_DATA

                    + +
                    +
                    + + + + + + + + + + + +
                    #define SMEM_EMB_WEIGHT_DATA( SMEM_IDX,
                    WEIGHT_IDX )    (SMEM_PTR_BASE(const emb_vec_t**)[SMEM_IDX])[WEIGHT_IDX]
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_EMB_WEIGHT_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_EMB_WEIGHT_PTR   SMEM_PTR_BASE(const emb_vec_t**)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_GENERIC_PTR

                    + +
                    +
                    + + + + +
                    #define SMEM_GENERIC_PTR   SMEM_PTR_BASE(uintptr_t*)
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_OFFSET

                    + +
                    +
                    + + + + +
                    #define SMEM_OFFSET    (IS_FULL_WARP ? j : ((threadIdx.x / LOAD_GROUP_SIZE) + (j * NUM_LOAD_GROUPS)))
                    +
                    + +
                    +
                    + +

                    ◆ SMEM_PTR_BASE

                    + +
                    +
                    + + + + + + + +
                    #define SMEM_PTR_BASE( TYPE)    (reinterpret_cast<TYPE>(smem + WEIGHT_PTR_OFFSET) + threadIdx.y * kWarpSize)
                    +
                    + +
                    +
                    + +

                    ◆ WEIGHT_OFFSET

                    + +
                    +
                    + + + + +
                    #define WEIGHT_OFFSET    (IS_FULL_WARP ? threadIdx.x : (threadIdx.x % LOAD_GROUP_SIZE))
                    +
                    + +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ cache_vec_t

                    + +
                    +
                    + + + + +
                    using cache_vec_t = vec4_type<cache_t>
                    +
                    + +
                    +
                    + +

                    ◆ output_vec_t

                    + +
                    +
                    + + + + +
                    using output_vec_t = vec4_type<output_t>
                    +
                    + +
                    +
                    + +

                    ◆ vec4_type

                    + +
                    +
                    +
                    +template<typename T >
                    + + + + +
                    using vec4_type = typename Vec4Type<T>::type
                    +
                    + +
                    +
                    +

                    Enumeration Type Documentation

                    + +

                    ◆ LXU_CACHE_PARAMS

                    + +
                    +
                    + + + + +
                    enum LXU_CACHE_PARAMS
                    +
                    + + + +
                    Enumerator
                    P_lxu_cache_weights 
                    P_lxu_cache_locations 
                    + +
                    +
                    + +

                    ◆ SAVED_PARAMS

                    + +
                    +
                    + + + + +
                    enum SAVED_PARAMS
                    +
                    + + + + + + + + + +
                    Enumerator
                    P_indices 
                    P_weights 
                    P_outputs 
                    P_index_weights 
                    P_offsets 
                    P_num_offsets 
                    P_load_D 
                    P_total_load_D 
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool USE_LXU_CACHE>
                    + + + + + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ DivMod()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    fd_num_warps_per_table DivMod (global_warp_id ,
                    & t,
                    & table_warp_id )
                    +
                    + +
                    +
                    + +

                    ◆ false() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ false() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template false (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ get_next_bag_boundary_and_L()

                    + +
                    +
                    +
                    +template<uint32_t LOWER_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void get_next_bag_boundary_and_L (const uint32_t bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [1/7]

                    + +
                    +
                    + + + + + + + +
                    if (is_small_L &&table_warp_id >=num_warps_for_small_L * 8)
                    +
                    + +
                    +
                    + +

                    ◆ if() [2/7]

                    + +
                    +
                    + + + + + + + +
                    if (is_small_L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [3/7]

                    + +
                    +
                    + + + + + + + +
                    if (is_zero_total_L )
                    +
                    + +
                    +
                    + +

                    ◆ if() [4/7]

                    + +
                    +
                    + + + + + + + +
                    if (L<= 1)
                    +
                    + +
                    +
                    + +

                    ◆ if() [5/7]

                    + +
                    +
                    + + + + + + + +
                    if (t >= T)
                    +
                    + +
                    +
                    + +

                    ◆ if() [6/7]

                    + +
                    +
                    + + + + + + + +
                    if (table_warp_id >=num_warps_per_row * is_small_L ? num_warps_for_small_L :B)
                    +
                    + +
                    +
                    + +

                    ◆ if() [7/7]

                    + +
                    +
                    + + + + + + + +
                    if (threadIdx. x = = 0)
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_large_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_large_Ls (long *const smem,
                    const uint32_t L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_no_pooling()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename output_vec_t , uint32_t STEP>
                    + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void process_all_indices_no_pooling (long *const smem,
                    const bool process_d,
                    const uint32_t params_offset )
                    +
                    + +
                    +
                    + +

                    ◆ process_all_indices_small_Ls()

                    + +
                    +
                    +
                    +template<typename index_t , typename emb_t , typename emb_vec_t , typename cache_t , typename cache_vec_t , typename output_vec_t , bool USE_CACHE_WEIGHTS, bool USE_MIXED_TYPE_CACHE, uint32_t WEIGHT_PTR_OFFSET, uint32_t STEP, uint32_t STEP_MASK, uint32_t LOAD_GROUP_SIZE>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __noinline__ __device__ void process_all_indices_small_Ls (long *const smem,
                    const uint32_t total_L,
                    const bool process_d,
                    const bool mean_pooling,
                    const uint32_t params_offset,
                    const uint32_t max_D_cache )
                    +
                    + +
                    +
                    + +

                    ◆ true() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const float *__restrict__ const dev_weights,
                    const float *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const at::Half *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    at::Half *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template __launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel< at template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    float *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ true() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template true (const uint8_t *__restrict__ const dev_weights,
                    const uint8_t *__restrict__ const uvm_weights,
                    const float *__restrict__ const lxu_cache_weights,
                    const int32_t *__restrict__ const weights_placements,
                    const uint32_t B,
                    const uint32_t T,
                    const bool mean_pooling,
                    const uint32_t max_D_cache,
                    const FixedDivisor fd_num_warps_per_table,
                    const int64_t *__restrict__ const indices,
                    const float *__restrict__ const index_weights,
                    const int64_t *__restrict__ const offsets,
                    const uint32_t *__restrict__ const D_offsets,
                    const int64_t *__restrict__ const weights_offsets,
                    const int32_t *__restrict__ const lxu_cache_locations,
                    uint8_t *__restrict__ const output )
                    +
                    + +
                    +
                    + +

                    ◆ write_loop_small_Ls()

                    + +
                    +
                    +
                    +template<typename emb_t , typename output_vec_t , uint32_t STEP, uint32_t BOUNDARY_IDX_BIT_CNT, uint32_t WARP_MASK>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __inline__ __device__ void write_loop_small_Ls (long *const smem,
                    uint32_t *const write_idx,
                    uint32_t *const bag_boundary,
                    int32_t *const next_boundary,
                    uint32_t *const L,
                    Vec4StepT< STEP, emb_t > *const accumulator,
                    const uint32_t params_offset,
                    const uint32_t l,
                    const bool process_d,
                    const bool mean_pooling )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ b

                    + +
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ D_start

                    + +
                    +
                    + + + + +
                    uint32_t D_start
                    +
                    + +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ global_warp_id

                    + +
                    +
                    + + + + +
                    const int32_t global_warp_id = blockIdx.x * blockDim.y + threadIdx.y
                    +
                    + +
                    +
                    + +

                    ◆ index_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const index_weights
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ is_small_L

                    + +
                    +
                    + + + + +
                    const auto is_small_L = total_L <= (static_cast<index_t>(B) * 8)
                    +
                    + +
                    +
                    + +

                    ◆ is_zero_total_L

                    + +
                    +
                    + + + + +
                    const auto is_zero_total_L = total_L == 0
                    +
                    + +
                    +
                    + +

                    ◆ L

                    + +
                    +
                    + + + + +
                    L = shfl_sync(L, 0)
                    +
                    + +
                    +
                    + +

                    ◆ load_D

                    + +
                    +
                    + + + + +
                    load_D = shfl_sync(load_D, 0)
                    +
                    + +
                    +
                    + +

                    ◆ load_d

                    + +
                    +
                    + + + + +
                    const uint32_t load_d = (table_warp_id % num_warps_per_row) * kWarpSize
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ LXU_PARAMS_CNT

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t LXU_PARAMS_CNT = 2
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ NUM_OFFSETS_PER_WARP

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_OFFSETS_PER_WARP = kWarpSize
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ NUM_PARAMS

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_PARAMS = SAVED_PARAMS_CNT + (USE_LXU_CACHE ? LXU_PARAMS_CNT : 0)
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ NUM_WARPS

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t NUM_WARPS = kForwardMaxThreads / kWarpSize
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ num_warps_for_small_L

                    + +
                    +
                    + + + + +
                    const uint32_t num_warps_for_small_L = DIV_ROUND_UP(B, NUM_OFFSETS_PER_WARP)
                    +
                    + +
                    +
                    + +

                    ◆ num_warps_per_row

                    + +
                    +
                    + + + + +
                    const uint32_t num_warps_per_row = DIV_ROUND_UP(load_D, kWarpSize)
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    +Initial value:
                    {
                    +
                    using emb_vec_t = vec4_type<emb_t>
                    +
                    typename Vec4Type< T >::type vec4_type
                    Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:69
                    +
                    +
                    +
                    + +

                    ◆ params_offset

                    + +
                    +
                    + + + + +
                    const uint32_t params_offset = NUM_PARAMS * threadIdx.y
                    +
                    + +
                    +
                    + +

                    ◆ row_start

                    + +
                    +
                    + + + + +
                    uint32_t row_start
                    +
                    + +
                    +
                    + +

                    ◆ SAVED_PARAMS_CNT

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t SAVED_PARAMS_CNT = 8
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ smem

                    + +
                    +
                    + + + + +
                    __shared__ long smem[NUM_PARAMS *NUM_WARPS+kForwardMaxThreads]
                    +
                    + +
                    +
                    + +

                    ◆ STEP

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t STEP = 4
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ t

                    + +
                    +
                    + + + + +
                    int32_t t
                    +
                    + +
                    +
                    + +

                    ◆ table_warp_id

                    + +
                    +
                    + + + + +
                    int32_t table_warp_id
                    +
                    + +
                    +
                    + +

                    ◆ total_L

                    + +
                    +
                    + + + + +
                    const auto total_L = offsets[(t + 1) * B] - offsets[t * B]
                    +
                    + +
                    +
                    + +

                    ◆ total_load_D

                    + +
                    +
                    + + + + +
                    uint32_t total_load_D
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    + +

                    ◆ use_lxu_cache

                    + +
                    +
                    + + + + +
                    use_lxu_cache = USE_LXU_CACHE
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ VEC_WIDTH

                    + +
                    +
                    + + + + + +
                    + + + + +
                    constexpr uint32_t VEC_WIDTH = 4
                    +
                    +constexpr
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html b/gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html new file mode 100644 index 000000000..6425c81a4 --- /dev/null +++ b/gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html @@ -0,0 +1,565 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu File Reference
                    +
                    +
                    +

                    Macro Definition Documentation

                    + +

                    ◆ DISPATCH_KERNEL_FOR_CACHE_CASE

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_KERNEL_FOR_CACHE_CASE( CACHE_CASE_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (CACHE_CASE_ == false) { \
                    +
                    constexpr auto _TUseCache = false; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (CACHE_CASE_ == true) { \
                    +
                    constexpr auto _TUseCache = true; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_FORWARD_KERNEL( MAX_D_,
                    ... )
                    +
                    + +
                    +
                    + +

                    ◆ DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL

                    + +
                    +
                    + + + + + + + + + + + +
                    #define DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL( DD_,
                    ... )
                    +
                    +Value:
                    [&] { \
                    +
                    if (DD_ <= 4) { \
                    +
                    constexpr int kEmbeddingSize = 4; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 8) { \
                    +
                    constexpr int kEmbeddingSize = 8; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 16) { \
                    +
                    constexpr int kEmbeddingSize = 16; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    if (DD_ <= 32) { \
                    +
                    constexpr int kEmbeddingSize = 32; \
                    +
                    return __VA_ARGS__(); \
                    +
                    } \
                    +
                    return; \
                    +
                    }()
                    +
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , typename index_t , bool use_lxu_cache>
                    + + + + + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ,
                    2048/ kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_forward_weighted_vbe_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_vbe_cuda (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ B

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t B
                    +
                    + +
                    +
                    + +

                    ◆ D_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const D_offsets
                    +
                    + +
                    +
                    + +

                    ◆ fd_num_warps_per_table

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor fd_num_warps_per_table
                    +
                    + +
                    +
                    + +

                    ◆ index_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const index_weights
                    +
                    + +
                    +
                    + +

                    ◆ indices

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const indices
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_locations

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ max_D_cache

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t max_D_cache
                    +
                    + +
                    +
                    + +

                    ◆ mean_pooling

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool mean_pooling
                    +
                    + +
                    +
                    + +

                    ◆ offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const offsets
                    +
                    + +
                    +
                    + +

                    ◆ output

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const const int32_t* __restrict__ const output_t* __restrict__ const output
                    +
                    + +
                    +
                    + +

                    ◆ T

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t T
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const const uint32_t const uint32_t const bool const uint32_t const FixedDivisor const index_t* __restrict__ const const float* __restrict__ const const index_t* __restrict__ const const uint32_t* __restrict__ const const int64_t* __restrict__ const weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    const emb_t* __restrict__ const const cache_t* __restrict__ const const int32_t* __restrict__ const weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html b/gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html new file mode 100644 index 000000000..d32fa3766 --- /dev/null +++ b/gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html @@ -0,0 +1,238 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_codegen_forward_weighted_vbe_meta()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_forward_weighted_vbe_meta (const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const Tensor & indice_weights,
                    const Tensor & lxu_cache_locations,
                    const int64_t output_dtype,
                    const Tensor & vbe_row_output_offsets,
                    const Tensor & vbe_b_t_map,
                    const int64_t vbe_output_size,
                    const int64_t info_B_num_bits,
                    const int64_t info_B_mask_int64,
                    const bool is_experimental )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__forward__split__weighted__vbe__kernel_8cu.html b/gen__embedding__forward__split__weighted__vbe__kernel_8cu.html new file mode 100644 index 000000000..578865d72 --- /dev/null +++ b/gen__embedding__forward__split__weighted__vbe__kernel_8cu.html @@ -0,0 +1,1303 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_forward_split_weighted_vbe_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_forward_split_weighted_vbe_kernel.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , typename output_t , bool use_lxu_cache, typename index_t , size_t kMaxVecsPerThread, size_t kThreadGroupSize>
                    + + + + + + + +
                    __launch_bounds__ (kForwardMaxThreads ) const
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __launch_bounds__ (kForwardMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [1/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [2/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [3/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [4/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [5/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [6/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [7/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [8/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [9/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [10/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [11/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize() [12/12]

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    template kWarpSize (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                    const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                    const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map,
                    const int32_t info_B_num_bits,
                    const uint32_t info_B_mask,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets,
                    int64_t pooling_mode,
                    pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations,
                    pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ false

                    + +
                    +
                    + + + + +
                    template false
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template float
                    +
                    + +
                    +
                    + +

                    ◆ int64_t

                    + +
                    +
                    + + + + +
                    template int64_t
                    +
                    + +
                    +
                    + +

                    ◆ true

                    + +
                    +
                    + + + + +
                    template true
                    +
                    + +
                    +
                    + +

                    ◆ uint8_t

                    + +
                    +
                    + + + + +
                    template uint8_t
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html b/gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html new file mode 100644 index 000000000..10140acec --- /dev/null +++ b/gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html @@ -0,0 +1,214 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_adagrad_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_adagrad_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_adagrad_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_adagrad_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__adam__split__device__kernel_8cuh.html b/gen__embedding__optimizer__adam__split__device__kernel_8cuh.html new file mode 100644 index 000000000..0721e547d --- /dev/null +++ b/gen__embedding__optimizer__adam__split__device__kernel_8cuh.html @@ -0,0 +1,254 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_adam_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_adam_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_adam_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_adam_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum2_offsets,
                    float learning_rate = 0,
                    float eps = 0,
                    float beta1 = 0,
                    float beta2 = 0,
                    float weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html b/gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html new file mode 100644 index 000000000..fc231ce61 --- /dev/null +++ b/gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_approx_rowwise_adagrad_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_approx_rowwise_adagrad_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html b/gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html new file mode 100644 index 000000000..ae6ec78e8 --- /dev/null +++ b/gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html @@ -0,0 +1,324 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_approx_rowwise_adagrad_with_counter_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_approx_rowwise_adagrad_with_counter_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & row_counter_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    float adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    float max_counter = 0,
                    float tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    float weight_norm_coefficient = 0.0,
                    float lower_bound = 0.0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html b/gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html new file mode 100644 index 000000000..9cb5835ec --- /dev/null +++ b/gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_approx_rowwise_adagrad_with_weight_decay_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_approx_rowwise_adagrad_with_weight_decay_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html b/gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html new file mode 100644 index 000000000..aaea8c863 --- /dev/null +++ b/gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html @@ -0,0 +1,189 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_approx_sgd_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_approx_sgd_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    float learning_rate = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__dense__split__device__kernel_8cuh.html b/gen__embedding__optimizer__dense__split__device__kernel_8cuh.html new file mode 100644 index 000000000..09a636096 --- /dev/null +++ b/gen__embedding__optimizer__dense__split__device__kernel_8cuh.html @@ -0,0 +1,189 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_dense_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_dense_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_dense_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_dense_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    float unused = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html b/gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html new file mode 100644 index 000000000..8de3859f2 --- /dev/null +++ b/gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html @@ -0,0 +1,254 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_lamb_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_lamb_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_lamb_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_lamb_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum2_offsets,
                    float learning_rate = 0,
                    float eps = 0,
                    float beta1 = 0,
                    float beta2 = 0,
                    float weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html b/gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html new file mode 100644 index 000000000..28b8a3213 --- /dev/null +++ b/gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_lars_sgd_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_lars_sgd_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float learning_rate = 0,
                    float eta = 0,
                    float momentum = 0,
                    float weight_decay = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__none__split__device__kernel_8cuh.html b/gen__embedding__optimizer__none__split__device__kernel_8cuh.html new file mode 100644 index 000000000..9bde84f22 --- /dev/null +++ b/gen__embedding__optimizer__none__split__device__kernel_8cuh.html @@ -0,0 +1,194 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_none_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_none_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_none_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_none_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    int64_t total_hash_size = 0,
                    int64_t total_unique_indices = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html b/gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html new file mode 100644 index 000000000..ed7e28aaf --- /dev/null +++ b/gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html @@ -0,0 +1,254 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_partial_rowwise_adam_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_partial_rowwise_adam_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum2_offsets,
                    float learning_rate = 0,
                    float eps = 0,
                    float beta1 = 0,
                    float beta2 = 0,
                    float weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html b/gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html new file mode 100644 index 000000000..d59e81e46 --- /dev/null +++ b/gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html @@ -0,0 +1,254 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_partial_rowwise_lamb_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_partial_rowwise_lamb_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum2_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum2_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum2_offsets,
                    float learning_rate = 0,
                    float eps = 0,
                    float beta1 = 0,
                    float beta2 = 0,
                    float weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html b/gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html new file mode 100644 index 000000000..a3f41433b --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html @@ -0,0 +1,232 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_split.cpp File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_split.cpp File Reference
                    +
                    +
                    +
                    #include <ATen/ATen.h>
                    +#include <ATen/TypeDefault.h>
                    +#include <ATen/core/op_registration/op_registration.h>
                    +#include <torch/script.h>
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                    +#include "fbgemm_gpu/embedding_common.h"
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_rowwise_adagrad_update()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void split_embedding_rowwise_adagrad_update (Tensor & dev_weights,
                    Tensor & uvm_weights,
                    Tensor & lxu_cache_weights,
                    const Tensor & grad_dev_weights,
                    const Tensor & grad_dev_indices,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t max_D,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    + +

                    ◆ TORCH_LIBRARY_FRAGMENT()

                    + +
                    +
                    + + + + + + + + + + + +
                    TORCH_LIBRARY_FRAGMENT (fbgemm ,
                    m  )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html b/gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html new file mode 100644 index 000000000..fe330f877 --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html @@ -0,0 +1,207 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu File Reference
                    +
                    +
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_embedding_rowwise_adagrad_update()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    __global__ __launch_bounds__(kMaxThreads) void split_rowwise_adagrad_update_kernel(at void split_embedding_rowwise_adagrad_update (Tensor & dev_weights,
                    Tensor & uvm_weights,
                    Tensor & lxu_cache_weights,
                    const Tensor & grad_dev_weights,
                    const Tensor & grad_dev_indices,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const int64_t max_D,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html b/gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html new file mode 100644 index 000000000..1b00e6ab9 --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html @@ -0,0 +1,229 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_rowwise_adagrad_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_rowwise_adagrad_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    float max_norm = 0.0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html b/gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html new file mode 100644 index 000000000..99ecc5858 --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html @@ -0,0 +1,434 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ __launch_bounds__() [1/2]

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + +
                    __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    + +

                    ◆ __launch_bounds__() [2/2]

                    + +
                    +
                    + + + + + + + +
                    template __global__ __launch_bounds__ (kMaxThreads )
                    +
                    + +
                    +
                    +

                    Variable Documentation

                    + +

                    ◆ dev_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> dev_weights
                    +
                    + +
                    +
                    + +

                    ◆ eps

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > float eps
                    +
                    + +
                    +
                    + +

                    ◆ float

                    + +
                    +
                    + + + + +
                    template __global__ float
                    +
                    + +
                    +
                    + +

                    ◆ grad_dev_indices

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices
                    +
                    + +
                    +
                    + +

                    ◆ grad_dev_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > grad_dev_weights
                    +
                    + +
                    +
                    + +

                    ◆ kWarpSize

                    + +
                    +
                    + + + + +
                    template __global__ kWarpSize
                    +
                    + +
                    +
                    + +

                    ◆ learning_rate

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > float float learning_rate
                    +
                    + +
                    +
                    + +

                    ◆ lxu_cache_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> lxu_cache_weights
                    +
                    + +
                    +
                    + +

                    ◆ max_D

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t max_D
                    +
                    + +
                    +
                    + +

                    ◆ max_norm

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > float float float int64_t float max_norm
                    +
                    + +
                    +
                    + +

                    ◆ momentum1_dev

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev
                    +
                    + +
                    +
                    + +

                    ◆ momentum1_offsets

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets
                    +
                    + +
                    +
                    + +

                    ◆ momentum1_placements

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements
                    +
                    + +
                    +
                    + +

                    ◆ momentum1_uvm

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm
                    +
                    + +
                    +
                    + +

                    ◆ sorted_lxu_cache_locations

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations
                    +
                    + +
                    +
                    + +

                    ◆ stochastic_rounding

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool stochastic_rounding
                    +
                    + +
                    +
                    + +

                    ◆ stochastic_rounding_philox_args

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState stochastic_rounding_philox_args
                    +
                    + +
                    +
                    + +

                    ◆ uvm_weights

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> uvm_weights
                    +
                    + +
                    +
                    + +

                    ◆ weight_decay

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > float float float weight_decay
                    +
                    + +
                    +
                    + +

                    ◆ weight_decay_mode

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > const at::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > const int32_t bool at::PhiloxCudaState at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > float float float int64_t weight_decay_mode
                    +
                    + +
                    +
                    + +

                    ◆ weights_offsets

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const at::PackedTensorAccessor32<float, 1, at::RestrictPtrTraits> const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> weights_offsets
                    +
                    + +
                    +
                    + +

                    ◆ weights_placements

                    + +
                    +
                    + + + + +
                    template __global__ at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<float, 1, at::RestrictPtrTraits> at::PackedTensorAccessor64<at::Half, 2, at::RestrictPtrTraits> const at::PackedTensorAccessor32<float, 1, at::RestrictPtrTraits> const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits> weights_placements
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html b/gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html new file mode 100644 index 000000000..428c91051 --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html @@ -0,0 +1,324 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_rowwise_adagrad_with_counter_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_rowwise_adagrad_with_counter_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & prev_iter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & prev_iter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & prev_iter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & prev_iter_offsets,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & row_counter_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & row_counter_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & row_counter_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & row_counter_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    float adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    float max_counter = 0,
                    float tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    float weight_norm_coefficient = 0.0,
                    float lower_bound = 0.0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html b/gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html new file mode 100644 index 000000000..e16f54db2 --- /dev/null +++ b/gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_rowwise_adagrad_with_weight_decay_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_rowwise_adagrad_with_weight_decay_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0.0,
                    int64_t weight_decay_mode = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html b/gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html new file mode 100644 index 000000000..bbc0f10cf --- /dev/null +++ b/gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_rowwise_weighted_adagrad_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_rowwise_weighted_adagrad_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_dev,
                    pta::PackedTensorAccessor64< at::acc_type< cache_t, true >, 1, at::RestrictPtrTraits > & momentum1_uvm,
                    pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & momentum1_placements,
                    pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & momentum1_offsets,
                    float eps = 0,
                    float learning_rate = 0,
                    float weight_decay = 0,
                    int64_t iter = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html b/gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html new file mode 100644 index 000000000..9b92e08e8 --- /dev/null +++ b/gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html @@ -0,0 +1,189 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/_skbuild/linux-x86_64-3.12/cmake-build/gen_embedding_optimizer_sgd_split_device_kernel.cuh File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    +
                    gen_embedding_optimizer_sgd_split_device_kernel.cuh File Reference
                    +
                    +
                    +

                    Function Documentation

                    + +

                    ◆ split_sgd_table_update_kernel()

                    + +
                    +
                    +
                    +template<typename emb_t , typename cache_t , size_t kMaxVecsPerThread, int32_t kThreadGroupSize = kWarpSize, int32_t VEC_WIDTH>
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    DEVICE_INLINE void split_sgd_table_update_kernel (pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & dev_weights,
                    pta::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > & uvm_weights,
                    pta::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > & lxu_cache_weights,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & weights_placements,
                    const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > & weights_offsets,
                    const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > & sorted_lxu_cache_locations,
                    Vec4T< at::acc_type< cache_t, true > > * grad_sum,
                    const bool stochastic_rounding,
                    const at::PhiloxCudaState & stochastic_rounding_philox_args,
                    const uint32_t run_id,
                    const uint32_t cache_loc_run_id,
                    const int32_t D,
                    const int32_t t,
                    const int64_t idx,
                    const uint32_t shfl_sync_mask,
                    const int32_t shared_weight_offset,
                    float learning_rate = 0 )
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/general/BuildInstructions.html b/general/BuildInstructions.html index aa470130b..f42c19250 100644 --- a/general/BuildInstructions.html +++ b/general/BuildInstructions.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

                    FBGEMM_GPU General Info

                    +

                    FBGEMM_GPU General Info

                    -

                    FBGEMM_GPU Python API

                    +

                    FBGEMM_GPU Python API

                    -

                    FBGEMM_GPU C++ API

                    +

                    FBGEMM_GPU C++ API

                    +

                    N

                    + + + +
                    +

                    P

                    - + @@ -73,10 +75,386 @@

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ int_nbit_split_embedding_codegen_lookup_function_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_lookup_function_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    c10::optional< Tensor > indice_weights,
                    int64_t output_dtype,
                    c10::optional< Tensor > lxu_cache_weights,
                    c10::optional< Tensor > lxu_cache_locations,
                    c10::optional< int64_t > row_alignment,
                    c10::optional< int64_t > max_float8_D,
                    c10::optional< int64_t > fp8_exponent_bits,
                    c10::optional< int64_t > fp8_exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    c10::optional< Tensor > indice_weights,
                    int64_t output_dtype,
                    c10::optional< Tensor > lxu_cache_weights,
                    c10::optional< Tensor > lxu_cache_locations,
                    c10::optional< int64_t > row_alignment,
                    c10::optional< int64_t > max_float8_D,
                    c10::optional< int64_t > fp8_exponent_bits,
                    c10::optional< int64_t > fp8_exponent_bias,
                    c10::optional< Tensor > cache_hash_size_cumsum,
                    c10::optional< int64_t > total_cache_hash_size,
                    c10::optional< Tensor > cache_index_table_map,
                    c10::optional< Tensor > lxu_cache_state,
                    c10::optional< Tensor > lxu_state )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_array_lookup_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    Tensor pruned_array_lookup_cpu (Tensor indices,
                    Tensor offsets,
                    Tensor index_remappings,
                    Tensor index_remappings_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_hashmap_insert_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void pruned_hashmap_insert_unweighted_cpu (Tensor indices,
                    Tensor dense_indices,
                    Tensor offsets,
                    Tensor hash_table,
                    Tensor hash_table_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_hashmap_lookup_unweighted_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    Tensor pruned_hashmap_lookup_unweighted_cpu (Tensor indices,
                    Tensor offsets,
                    Tensor hash_table,
                    Tensor hash_table_offsets )
                    +
                    + +
                    +
                    diff --git a/group__embedding-cuda.html b/group__embedding-cuda.html index 3e1350ec3..38046379c 100644 --- a/group__embedding-cuda.html +++ b/group__embedding-cuda.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Embedding CUDA Operators + + @@ -29,7 +31,7 @@ - + @@ -77,11 +79,188 @@ - +

                    Functions

                    Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional< Tensor > indice_weights, int64_t output_dtype, c10::optional< Tensor > lxu_cache_weights, c10::optional< Tensor > lxu_cache_locations, c10::optional< int64_t > row_alignment, c10::optional< int64_t > max_float8_D, c10::optional< int64_t > fp8_exponent_bits, c10::optional< int64_t > fp8_exponent_bias, c10::optional< Tensor > cache_hash_size_cumsum, c10::optional< int64_t > total_cache_hash_size, c10::optional< Tensor > cache_index_table_map, c10::optional< Tensor > lxu_cache_state, c10::optional< Tensor > lxu_state)
                    Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, c10::optional< Tensor > indice_weights, int64_t output_dtype, c10::optional< Tensor > lxu_cache_weights, c10::optional< Tensor > lxu_cache_locations, c10::optional< int64_t > row_alignment, c10::optional< int64_t > max_float8_D, c10::optional< int64_t > fp8_exponent_bits, c10::optional< int64_t > fp8_exponent_bias, c10::optional< Tensor > cache_hash_size_cumsum, c10::optional< int64_t > total_cache_hash_size, c10::optional< Tensor > cache_index_table_map, c10::optional< Tensor > lxu_cache_state, c10::optional< Tensor > lxu_state)
                     

                    Detailed Description

                    Function Documentation

                    + +

                    ◆ bounds_check_indices_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void bounds_check_indices_cuda (Tensor & rows_per_table,
                    Tensor & indices,
                    Tensor & offsets,
                    int64_t bounds_check_mode,
                    Tensor & warning,
                    const c10::optional< Tensor > & weights,
                    const c10::optional< Tensor > & B_ofsets,
                    const int64_t max_B )
                    +
                    + +
                    +
                    + +

                    ◆ int_nbit_split_embedding_codegen_lookup_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor int_nbit_split_embedding_codegen_lookup_function (Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    int64_t total_D,
                    int64_t max_int2_D,
                    int64_t max_int4_D,
                    int64_t max_int8_D,
                    int64_t max_float16_D,
                    int64_t max_float32_D,
                    Tensor indices,
                    Tensor offsets,
                    int64_t pooling_mode,
                    c10::optional< Tensor > indice_weights,
                    int64_t output_dtype,
                    c10::optional< Tensor > lxu_cache_weights,
                    c10::optional< Tensor > lxu_cache_locations,
                    c10::optional< int64_t > row_alignment,
                    c10::optional< int64_t > max_float8_D,
                    c10::optional< int64_t > fp8_exponent_bits,
                    c10::optional< int64_t > fp8_exponent_bias )
                    +
                    + +
                    +

                    ◆ int_nbit_split_embedding_uvm_caching_codegen_lookup_function()

                    @@ -89,188 +268,3633 @@

                    - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - + + +
                    Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function Tensor int_nbit_split_embedding_uvm_caching_codegen_lookup_function (Tensor dev_weights, Tensor dev_weights,
                    Tensor uvm_weights,
                    Tensor weights_placements,
                    Tensor weights_offsets,
                    Tensor weights_tys,
                    Tensor D_offsets,
                    Tensor uvm_weights, int64_t total_D,
                    Tensor weights_placements, int64_t max_int2_D,
                    Tensor weights_offsets, int64_t max_int4_D,
                    Tensor weights_tys, int64_t max_int8_D,
                    Tensor D_offsets, int64_t max_float16_D,
                    int64_t total_D, int64_t max_float32_D,
                    int64_t max_int2_D, Tensor indices,
                    int64_t max_int4_D, Tensor offsets,
                    int64_t max_int8_D, int64_t pooling_mode,
                    int64_t max_float16_D, c10::optional< Tensor > indice_weights,
                    int64_t max_float32_D, int64_t output_dtype,
                    Tensor indices, c10::optional< Tensor > lxu_cache_weights,
                    Tensor offsets, c10::optional< Tensor > lxu_cache_locations,
                    int64_t pooling_mode, c10::optional< int64_t > row_alignment,
                    c10::optional< Tensor > indice_weights, c10::optional< int64_t > max_float8_D,
                    int64_t output_dtype, c10::optional< int64_t > fp8_exponent_bits,
                    c10::optional< Tensor > lxu_cache_weights, c10::optional< int64_t > fp8_exponent_bias,
                    c10::optional< Tensor > lxu_cache_locations, c10::optional< Tensor > cache_hash_size_cumsum,
                    c10::optional< int64_t > row_alignment, c10::optional< int64_t > total_cache_hash_size,
                    c10::optional< int64_t > max_float8_D, c10::optional< Tensor > cache_index_table_map,
                    c10::optional< int64_t > fp8_exponent_bits, c10::optional< Tensor > lxu_cache_state,
                    c10::optional< int64_t > fp8_exponent_bias, c10::optional< Tensor > lxu_state )
                    +
                    +

                    Similar to int_nbit_split_embedding_codegen_lookup_function, but it does UVM_CACHING lookup.

                    + +
                    + +
                    +

                    ◆ pruned_array_lookup_cuda()

                    + +
                    +
                    + + + + + - - + - - + - - + + +
                    Tensor pruned_array_lookup_cuda (Tensor indices,
                    c10::optional< Tensor > cache_hash_size_cumsum, Tensor offsets,
                    c10::optional< int64_t > total_cache_hash_size, Tensor index_remappings,
                    c10::optional< Tensor > cache_index_table_map, Tensor index_remappings_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ pruned_hashmap_lookup_cuda()

                    + +
                    +
                    + + + + + - - + - - + + - - + + +
                    Tensor pruned_hashmap_lookup_cuda (Tensor indices,
                    c10::optional< Tensor > lxu_cache_state, Tensor offsets,
                    c10::optional< Tensor > lxu_state Tensor hash_table,
                    )Tensor hash_table_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_adagrad_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_adagrad_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_adam_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_adam_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_approx_rowwise_adagrad_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_approx_sgd_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_approx_sgd_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    double learning_rate = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_lamb_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_lamb_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_lars_sgd_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_lars_sgd_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double learning_rate = 0,
                    double eta = 0,
                    double momentum = 0,
                    double weight_decay = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_none_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_none_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    int64_t total_hash_size = 0,
                    int64_t total_unique_indices = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_partial_rowwise_adam_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_partial_rowwise_adam_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_partial_rowwise_lamb_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_partial_rowwise_lamb_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor momentum2_dev,
                    Tensor momentum2_uvm,
                    Tensor momentum2_placements,
                    Tensor momentum2_offsets,
                    double learning_rate = 0,
                    double eps = 0,
                    double beta1 = 0,
                    double beta2 = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_rowwise_adagrad_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_rowwise_adagrad_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    double max_norm = 0.0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    Tensor prev_iter_dev,
                    Tensor prev_iter_uvm,
                    Tensor prev_iter_placements,
                    Tensor prev_iter_offsets,
                    Tensor row_counter_dev,
                    Tensor row_counter_uvm,
                    Tensor row_counter_placements,
                    Tensor row_counter_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t iter = 0,
                    int64_t counter_halflife = -1,
                    int64_t adjustment_iter = -1,
                    double adjustment_ub = 1.0,
                    int64_t learning_rate_mode = -1,
                    int64_t weight_decay_mode = 1,
                    int64_t grad_sum_decay = -1,
                    double max_counter = 0,
                    double tail_id_threshold = 0.0,
                    int64_t is_tail_id_thresh_ratio = 0,
                    int64_t regularization_mode = 0,
                    double weight_norm_coefficient = 0.0,
                    double lower_bound = 0.0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0.0,
                    int64_t weight_decay_mode = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_rowwise_weighted_adagrad_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_rowwise_weighted_adagrad_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    Tensor momentum1_dev,
                    Tensor momentum1_uvm,
                    Tensor momentum1_placements,
                    Tensor momentum1_offsets,
                    double eps = 0,
                    double learning_rate = 0,
                    double weight_decay = 0,
                    int64_t iter = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    +
                    + +
                    +
                    + +

                    ◆ split_embedding_codegen_lookup_sgd_function()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor split_embedding_codegen_lookup_sgd_function (const Tensor & placeholder_autograd_tensor,
                    const Tensor & dev_weights,
                    const Tensor & uvm_weights,
                    const Tensor & lxu_cache_weights,
                    const Tensor & weights_placements,
                    const Tensor & weights_offsets,
                    const Tensor & D_offsets,
                    const int64_t total_D,
                    const int64_t max_D,
                    const Tensor & hash_size_cumsum,
                    const int64_t total_hash_size_bits,
                    const Tensor & indices,
                    const Tensor & offsets,
                    const int64_t pooling_mode,
                    const c10::optional< Tensor > & indice_weights,
                    const c10::optional< Tensor > & feature_requires_grad,
                    const Tensor & lxu_cache_locations,
                    const bool gradient_clipping,
                    const double max_gradient,
                    const bool stochastic_rounding,
                    double learning_rate = 0,
                    const int64_t output_dtype = static_cast<int64_t>(SparseType::FP32),
                    const c10::optional< Tensor > & B_offsets = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_output_offsets_feature_rank = c10::optional<Tensor>(),
                    const c10::optional< Tensor > & vbe_B_offsets_rank_per_feature = c10::optional<Tensor>(),
                    const int64_t max_B = -1,
                    const int64_t max_B_feature_rank = -1,
                    const int64_t vbe_output_size = -1,
                    const bool is_experimental = false,
                    const bool use_uniq_cache_locations_bwd = false,
                    const bool use_homogeneous_placements = false )
                    -

                    Simlar to int_nbit_split_embedding_codegen_lookup_function, but it does UVM_CACHING lookup.

                    diff --git a/group__input-combine.html b/group__input-combine.html index 7e4fb4d79..106252d61 100644 --- a/group__input-combine.html +++ b/group__input-combine.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Combine Input Operators + + @@ -29,7 +31,7 @@ - + @@ -73,10 +75,78 @@

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ padding_fused_tbe_input_combine_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    std::tuple< Tensor, Tensor, Tensor > padding_fused_tbe_input_combine_cpu (const std::vector< at::Tensor > & indices_list,
                    const std::vector< at::Tensor > & offsets_list,
                    const std::vector< at::Tensor > & per_sample_weights,
                    const at::Tensor & include_last_offsets,
                    int64_t batch_size )
                    +
                    + +
                    +
                    + +

                    ◆ tbe_input_combine_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    std::tuple< Tensor, Tensor, Tensor > tbe_input_combine_cpu (const std::vector< at::Tensor > & indices_list,
                    const std::vector< at::Tensor > & offsets_list,
                    const std::vector< at::Tensor > & per_sample_weights,
                    const at::Tensor & include_last_offsets )
                    +
                    + +
                    +
                    diff --git a/group__jagged-tensor-ops-cpu.html b/group__jagged-tensor-ops-cpu.html index c13fa834e..47118c6e8 100644 --- a/group__jagged-tensor-ops-cpu.html +++ b/group__jagged-tensor-ops-cpu.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Jagged Tensor Operators + + @@ -29,7 +31,7 @@ - + @@ -77,42 +79,145 @@ - + - +

                    Functions

                    Tensor jagged_dense_elementwise_add (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                    Tensor jagged_dense_elementwise_add (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                     
                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                     

                    Detailed Description

                    +

                    The following are Jagged Tensor CPU Operators

                    +

                    The following are Jagged Tensor CPU Operators

                    The following are Jagged Tensor CPU Operators

                    Function Documentation

                    - -

                    ◆ jagged_dense_elementwise_add()

                    + +

                    ◆ batched_dense_vec_jagged_2d_mul()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    Tensor batched_dense_vec_jagged_2d_mul (const Tensor & v,
                    const Tensor & a_values,
                    const Tensor & a_offsets )
                    +
                    + +
                    +
                    + +

                    ◆ dense_to_jagged()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    std::tuple< Tensor, std::vector< Tensor > > dense_to_jagged (const Tensor & dense,
                    const std::vector< Tensor > & offsets,
                    c10::optional< at::SymInt > total_L )
                    +
                    + +
                    +
                    + +

                    ◆ jagged_1d_to_dense()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + +
                    Tensor jagged_1d_to_dense (Tensor values,
                    Tensor offsets,
                    c10::SymInt max_L,
                    int64_t padding_value )
                    +
                    + +
                    +
                    + +

                    ◆ jagged_2d_to_dense()

                    - + - - + + + + + + - - + + +
                    Tensor jagged_dense_elementwise_add Tensor jagged_2d_to_dense (const Tensor & x_values, Tensor values,
                    Tensor offsets,
                    const std::vector< Tensor > & x_offsets, c10::SymInt max_sequence_length )
                    +
                    + +
                    +
                    + +

                    ◆ jagged_dense_elementwise_add()

                    + + + +

                    ◆ jagged_dense_elementwise_mul()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_mul (const Tensor & x_values,
                    const std::vector< Tensor > & x_offsets,
                    const Tensor & y )
                    +
                    + +
                    +
                    + +

                    ◆ jagged_to_padded_dense()

                    + +
                    +
                    + + + - - + - - + - - + + - - +
                    Tensor jagged_to_padded_dense (const Tensor & x_values, const Tensor & values,
                    const std::vector< Tensor > & x_offsets, const std::vector< Tensor > & offsets,
                    const Tensor & y const c10::SymIntArrayRef max_lengths,
                    )const double padding_value )
                    -

                    Output = x + y where x is jagged, y is dense, and output is jagged

                    diff --git a/group__jagged-tensor-ops-cuda.html b/group__jagged-tensor-ops-cuda.html index 73fb339e5..2d1dc75b6 100644 --- a/group__jagged-tensor-ops-cuda.html +++ b/group__jagged-tensor-ops-cuda.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Jagged Tensor CUDA Operators + + @@ -29,7 +31,7 @@ - + @@ -77,7 +79,7 @@ - +

                    Functions

                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                     

                    Detailed Description

                    @@ -90,38 +92,61 @@

                    - + - - + - - + - - + +
                    std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda (const Tensor & x_values, const Tensor & x_values,
                    const std::vector< Tensor > & x_offsets, const std::vector< Tensor > & x_offsets,
                    const Tensor & y const Tensor & y )
                    +
                    +

                    output = x + y where x is jagged, y is dense, and output is jagged

                    + +
                    + +
                    +

                    ◆ jagged_to_padded_dense_forward()

                    + +
                    +
                    + + + + + + + + + + + + - - + + + + + +
                    at::Tensor jagged_to_padded_dense_forward (const Tensor & values,
                    const std::vector< Tensor > & offsets,
                    )c10::SymIntArrayRef max_lengths,
                    const double padding_value )
                    -

                    output = x + y where x is jagged, y is dense, and output is jagged

                    diff --git a/group__layout-transform-cpu.html b/group__layout-transform-cpu.html index 475054cf3..5a86a4e7e 100644 --- a/group__layout-transform-cpu.html +++ b/group__layout-transform-cpu.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Layout Transformation CPU Operators + + @@ -29,7 +31,7 @@ - + @@ -73,10 +75,32 @@

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ recat_embedding_grad_output_mixed_D_cpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor recat_embedding_grad_output_mixed_D_cpu (const Tensor & grad_output,
                    const std::vector< int64_t > & dim_sum_per_rank )
                    +
                    + +
                    +
                    diff --git a/group__layout-transform-cuda.html b/group__layout-transform-cuda.html index 0745132f1..f28df6a8f 100644 --- a/group__layout-transform-cuda.html +++ b/group__layout-transform-cuda.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Layout Transformation CUDA Operators + + @@ -29,7 +31,7 @@ - + @@ -73,10 +75,79 @@

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ recat_embedding_grad_output_cuda()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor recat_embedding_grad_output_cuda (Tensor grad_output,
                    const std::vector< int64_t > & num_features_per_rank )
                    +
                    + +
                    +
                    + +

                    ◆ recat_embedding_grad_output_mixed_D_batch_cuda()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    Tensor recat_embedding_grad_output_mixed_D_batch_cuda (const Tensor & grad_output,
                    const Tensor & dim_sum_per_rank,
                    const Tensor & cumsum_dim_sum_per_rank )
                    +
                    + +
                    +
                    + +

                    ◆ recat_embedding_grad_output_mixed_D_cuda()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor recat_embedding_grad_output_mixed_D_cuda (const Tensor & grad_output,
                    const std::vector< int64_t > & dim_sum_per_rank )
                    +
                    + +
                    +
                    diff --git a/group__merge-pooled-emb.html b/group__merge-pooled-emb.html index ea22c207f..2ad44c108 100644 --- a/group__merge-pooled-emb.html +++ b/group__merge-pooled-emb.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Merge Operators + + @@ -29,7 +31,7 @@ - + @@ -73,10 +75,32 @@

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ all_to_one_device()

                    + +
                    +
                    + + + + + + + + + + + +
                    std::vector< Tensor > all_to_one_device (std::vector< at::Tensor > inputTensors,
                    at::Device target_device )
                    +
                    + +
                    +
                    diff --git a/group__permute-pooled-embs-cpu.html b/group__permute-pooled-embs-cpu.html index 1955ee7ae..51ace5e26 100644 --- a/group__permute-pooled-embs-cpu.html +++ b/group__permute-pooled-embs-cpu.html @@ -3,12 +3,14 @@ - + -fbgemm_gpu: CPU Permutation Operators +fbgemm_gpu: Permute Pooled Embeddings Operators (CPU) + + @@ -29,7 +31,7 @@ - + @@ -69,14 +71,200 @@
                    -
                    CPU Permutation Operators
                    +
                    Permute Pooled Embeddings Operators (CPU)

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ permute_pooled_embs_auto_grad()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    at::Tensor permute_pooled_embs_auto_grad (const Tensor & pooled_embs,
                    const Tensor & offset_dim_list,
                    const Tensor & permute_list,
                    const Tensor & inv_offset_dim_list,
                    const Tensor & inv_permute_list )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_auto_grad_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    at::Tensor permute_pooled_embs_auto_grad_cpu (const Tensor & pooled_embs,
                    const Tensor & offset_dim_list,
                    const Tensor & permute_list,
                    const Tensor & inv_offset_dim_list,
                    const Tensor & inv_permute_list )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_auto_grad_split_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_auto_grad_split_cpu (const at::Tensor & pooled_embs,
                    const at::Tensor & offset_dim_list,
                    const at::Tensor & permute_list,
                    const at::Tensor & inv_offset_dim_list,
                    const at::Tensor & inv_permute_list )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_cpu_impl()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_cpu_impl (const at::Tensor & pooled_embs,
                    const at::Tensor & offset_dim_list,
                    const at::Tensor & permute_list,
                    const at::Tensor & inv_offset_dim_list,
                    const at::Tensor & inv_permute_list,
                    const bool & allow_duplicates )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_split_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_split_cpu (const at::Tensor & pooled_embs,
                    const at::Tensor & offset_dim_list,
                    const at::Tensor & permute_list,
                    const at::Tensor & inv_offset_dim_list,
                    const at::Tensor & inv_permute_list )
                    +
                    + +
                    +
                    diff --git a/group__permute-pooled-embs-gpu.html b/group__permute-pooled-embs-gpu.html index 341b6c148..16a7b5513 100644 --- a/group__permute-pooled-embs-gpu.html +++ b/group__permute-pooled-embs-gpu.html @@ -3,12 +3,14 @@ - + -fbgemm_gpu: CUDA Permutation Operators +fbgemm_gpu: Permute Pooled Embeddings Operators (CUDA) + + @@ -29,7 +31,7 @@ - + @@ -69,14 +71,123 @@
                    -
                    CUDA Permutation Operators
                    +
                    Permute Pooled Embeddings Operators (CUDA)

                    Detailed Description

                    +

                    Function Documentation

                    + +

                    ◆ permute_pooled_embs_auto_grad_gpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_auto_grad_gpu (const Tensor & pooled_embs,
                    const Tensor & offset_dim_list,
                    const Tensor & permute_list,
                    const Tensor & inv_offset_dim_list,
                    const Tensor & inv_permute_list )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_auto_grad_split_gpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_auto_grad_split_gpu (const at::Tensor & pooled_embs,
                    const at::Tensor & offset_dim_list,
                    const at::Tensor & permute_list,
                    const at::Tensor & inv_offset_dim_list,
                    const at::Tensor & inv_permute_list )
                    +
                    + +
                    +
                    + +

                    ◆ permute_pooled_embs_split_gpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    Tensor permute_pooled_embs_split_gpu (const at::Tensor & pooled_embs,
                    const at::Tensor & offset_dim_list,
                    const at::Tensor & permute_list,
                    const at::Tensor & inv_offset_dim_list,
                    const at::Tensor & inv_permute_list )
                    +
                    + +
                    +
                    diff --git a/group__quantize-data-cpu.html b/group__quantize-data-cpu.html index acdf74192..b6ba0c1bd 100644 --- a/group__quantize-data-cpu.html +++ b/group__quantize-data-cpu.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Quantize Data CPU Operators + + @@ -29,7 +31,7 @@ - + @@ -74,10 +76,361 @@

                    Detailed Description

                    The following are CPU Operators

                    +

                    Function Documentation

                    + +

                    ◆ _float_to_fused8bitrowwise_cpu_out()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor & _float_to_fused8bitrowwise_cpu_out (Tensor & output,
                    const Tensor & input )
                    +
                    + +
                    +
                    + +

                    ◆ _fused8bitrowwise_to_float_cpu_out()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor & _fused8bitrowwise_to_float_cpu_out (Tensor & output,
                    const Tensor & input )
                    +
                    + +
                    +
                    + +

                    ◆ float_or_half_to_fused8bitrowwise_cpu()

                    + +
                    +
                    + + + + + + + +
                    Tensor float_or_half_to_fused8bitrowwise_cpu (const Tensor & input)
                    +
                    + +
                    +
                    + +

                    ◆ float_to_FP8rowwise_cpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor float_to_FP8rowwise_cpu (const Tensor & input,
                    bool forward )
                    +
                    + +
                    +
                    + +

                    ◆ float_to_fused8bitrowwise_cpu()

                    + +
                    +
                    + + + + + + + +
                    Tensor float_to_fused8bitrowwise_cpu (const Tensor & input)
                    +
                    + +
                    +
                    + +

                    ◆ FloatToFP8Quantized_ref()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void FloatToFP8Quantized_ref (const float *const input,
                    const size_t nrows,
                    const size_t ncols,
                    uint8_t *const output,
                    const int ebits,
                    const int exponent_bias,
                    const double max_pos )
                    +
                    + +
                    +
                    + +

                    ◆ FP8QuantizedToFloat_ref()

                    + +
                    +
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void FP8QuantizedToFloat_ref (const uint8_t *const input,
                    const size_t nrows,
                    const size_t ncols,
                    float *const output,
                    const int ebits,
                    const int exponent_bias )
                    +
                    + +
                    +
                    + +

                    ◆ FP8rowwise_to_float_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    Tensor FP8rowwise_to_float_cpu (const Tensor & input,
                    bool forward,
                    const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ fused8bitrowwise_to_float_cpu()

                    + +
                    +
                    + + + + + + + +
                    Tensor fused8bitrowwise_to_float_cpu (const Tensor & input)
                    +
                    + +
                    +
                    + +

                    ◆ fused8bitrowwise_to_float_or_half_cpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor fused8bitrowwise_to_float_or_half_cpu (const Tensor & input,
                    const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ fused8bitrowwise_to_half_cpu()

                    + +
                    +
                    + + + + + + + +
                    Tensor fused8bitrowwise_to_half_cpu (const Tensor & input)
                    +
                    + +
                    +
                    + +

                    ◆ fusednbitrowwise_to_float_cpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor fusednbitrowwise_to_float_cpu (const Tensor & input,
                    const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ fusednbitrowwise_to_float_or_half_cpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    Tensor fusednbitrowwise_to_float_or_half_cpu (const Tensor & input,
                    const int64_t bit_rate,
                    const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ fusednbitrowwise_to_half_cpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    Tensor fusednbitrowwise_to_half_cpu (const Tensor & input,
                    const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ half_to_fused8bitrowwise_cpu()

                    + +
                    +
                    + + + + + + + +
                    Tensor half_to_fused8bitrowwise_cpu (const Tensor & input)
                    +
                    + +
                    +
                    diff --git a/group__quantize-ops-cuda.html b/group__quantize-ops-cuda.html index e9e2f38c9..c382fca4e 100644 --- a/group__quantize-ops-cuda.html +++ b/group__quantize-ops-cuda.html @@ -3,12 +3,14 @@ - + -fbgemm_gpu: Quantization Operators for CUDA +fbgemm_gpu: Quantization Operators (CUDA) + + @@ -29,7 +31,7 @@ - + @@ -71,27 +73,26 @@
                    -
                    Quantization Operators for CUDA
                    +
                    Quantization Operators (CUDA)
                    - + - + - + - + - + - +

                    Functions

                    DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
                    DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
                     
                    DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
                    DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
                     
                    DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
                    DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
                     
                    DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
                    DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
                     
                    DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
                    DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
                     
                    DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
                    DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
                     

                    Detailed Description

                    -

                    The following are CUDA Operators

                    Function Documentation

                    ◆ _bfloat16_to_float_gpu()

                    @@ -100,10 +101,9 @@

                    - + - - +
                    DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor & input)const at::Tensor & input)
                    @@ -119,10 +119,9 @@

                    - + - - +
                    DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor & input)const at::Tensor & input)
                    @@ -131,40 +130,113 @@

                    -

                    ◆ _float_to_hfp8_gpu()

                    + +

                    ◆ _float_to_FP8rowwise_gpu()

                    - + - - + - - + + +
                    DLL_PUBLIC at::Tensor _float_to_hfp8_gpu DLL_PUBLIC Tensor _float_to_FP8rowwise_gpu (const at::Tensor & input, const Tensor & input,
                    const int64_t ebits, const bool forward )
                    +
                    + +
                    +
                    + +

                    ◆ _float_to_fused8bitrowwise_gpu()

                    + +
                    +
                    + + + + + + + +
                    DLL_PUBLIC Tensor _float_to_fused8bitrowwise_gpu (const Tensor & input)
                    +
                    + +
                    +
                    + +

                    ◆ _float_to_fusednbitrowwise_gpu()

                    + +
                    +
                    + + + + + - - + + +
                    DLL_PUBLIC Tensor _float_to_fusednbitrowwise_gpu (const Tensor & input,
                    const int64_t exponent_bias, const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ _float_to_fusednbitrowwise_gpu_t()

                    + +
                    +
                    +
                    +template<typename input_t >
                    + + + + + - - + +
                    Tensor _float_to_fusednbitrowwise_gpu_t (const Tensor & input,
                    const double max_pos const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ _float_to_hfp8_gpu()

                    + + + +

                    ◆ _float_to_paddedFP8rowwise_gpu()

                    + +
                    +
                    + + + + + + + + + + + - - +
                    DLL_PUBLIC Tensor _float_to_paddedFP8rowwise_gpu (const Tensor & input,
                    const bool forward,
                    )const int64_t row_dim )
                    -

                    Converts a tensor of float values into a tensor of Microsoft Floating Point (msfp) values.

                    - -

                    ◆ _hfp8_to_float_gpu()

                    + +

                    ◆ _fused8bitrowwise_to_float_mixed_dim_gpu()

                    + +
                    +
                    + + + + + + + + + + + + + + + + +
                    DLL_PUBLIC at::Tensor _fused8bitrowwise_to_float_mixed_dim_gpu (const at::Tensor & input,
                    const at::Tensor & D_offsets,
                    const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ _fused8bitrowwise_to_single_or_half_precision_gpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    DLL_PUBLIC at::Tensor _fused8bitrowwise_to_single_or_half_precision_gpu (const at::Tensor & input,
                    const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ _fusednbitrowwise_to_float_gpu_t()

                    + +
                    +
                    +
                    +template<typename output_t >
                    + + + + + + + + + + + +
                    Tensor _fusednbitrowwise_to_float_gpu_t (const Tensor & input,
                    const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ _fusednbitrowwise_to_float_or_half_gpu()

                    - + - - + - - + - - + + +
                    DLL_PUBLIC at::Tensor _hfp8_to_float_gpu DLL_PUBLIC at::Tensor _fusednbitrowwise_to_float_or_half_gpu (const at::Tensor & input, const at::Tensor & input,
                    const int64_t ebits, const int64_t bit_rate,
                    const int64_t exponent_bias const int64_t output_dtype )
                    +
                    + +
                    +
                    + +

                    ◆ _fusednbitrowwise_to_half_gpu()

                    + +
                    +
                    + + + + + + - - + + +
                    DLL_PUBLIC at::Tensor _fusednbitrowwise_to_half_gpu (const at::Tensor & input,
                    )const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ _half_to_fusednbitrowwise_gpu()

                    + +
                    +
                    + + + + + + + + + + + +
                    DLL_PUBLIC at::Tensor _half_to_fusednbitrowwise_gpu (const at::Tensor & input,
                    const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ _hfp8_to_float_gpu()

                    + + + +

                    ◆ _single_or_half_precision_to_fused8bitrowwise_gpu()

                    + +
                    +
                    + + + + - -
                    DLL_PUBLIC Tensor _single_or_half_precision_to_fused8bitrowwise_gpu (const Tensor & input) )
                    -

                    Converts a tensor of Microsoft Floating Point (msfp) values into a tensor of float values.

                    diff --git a/group__sparse-data-cpu.html b/group__sparse-data-cpu.html index e9ef7bf33..9c527a9ee 100644 --- a/group__sparse-data-cpu.html +++ b/group__sparse-data-cpu.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Sparse Data CPU Operators + + @@ -29,7 +31,7 @@ - + @@ -77,9 +79,9 @@ - + - +

                    Functions

                    std::tuple< at::Tensor, at::Tensor > histogram_binning_calibration_cpu (const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound=0.0, double upper_bound=1.0, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                    std::tuple< at::Tensor, at::Tensor > histogram_binning_calibration_cpu (const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound=0.0, double upper_bound=1.0, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                     
                    std::tuple< at::Tensor, at::Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                    std::tuple< at::Tensor, at::Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                     

                    Detailed Description

                    @@ -92,69 +94,54 @@

                    - + - - + - - + - - + - - + - - + - - + - - + - - + - - + - - - - - - - +
                    std::tuple< Tensor, Tensor > generic_histogram_binning_calibration_by_feature_cpu std::tuple< Tensor, Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor & logit, const at::Tensor & logit,
                    const at::Tensor & segment_value, const at::Tensor & segment_value,
                    const at::Tensor & segment_lengths, const at::Tensor & segment_lengths,
                    int64_t num_segments, int64_t num_segments,
                    const at::Tensor & bin_num_examples, const at::Tensor & bin_num_examples,
                    const at::Tensor & bin_num_positives, const at::Tensor & bin_num_positives,
                    const at::Tensor & bin_boundaries, const at::Tensor & bin_boundaries,
                    double positive_weight, double positive_weight,
                    int64_t bin_ctr_in_use_after = 0, int64_t bin_ctr_in_use_after = 0,
                    double bin_ctr_weight_value = 1.0 
                    )double bin_ctr_weight_value = 1.0 )
                    @@ -171,8 +158,9 @@

                    segment_value/lengthsValues and lengths in KeyJaggedTensor. Assumes value of length is either 0 or 1. num_bins# of bins is no longer the same as bin_num_examples, and bin_num_positives, all of which should be still the same size. lower/upper_boundBounds of the bins. - bin_ctr_in_use_afterWe will use the calibration_target for the final calibrated prediction if we don't have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value is 0. @parambin_ctr_weight_value Weight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:
                    @@ -249,8 +224,8 @@

                    positive_weightis passed as input argument. The number of bins is automatically derived from bin_num_examples, and bin_num_positives, all of which should be the same size. lower/upper_boundBounds of the bins. bin_ctr_in_use_afterWe will use the calibration_target for the final calibrated prediction if we don't have sufficient examples. Only use the statistical value of bin CTR after we observe bin_ctr_in_use_after examples that fall in this bin. Default value: 0. - bin_ctr_weight_valueWeight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:
                    final_calibrated_prediction = bin_ctr_weight * bin_ctr + (1 -
                    -
                    bin_ctr_weight) * calibration_target
                    + bin_ctr_weight_valueWeight for statistical value of bin CTR. When this is specified, we perform a weighted sum for the statisctical bin CTR and the calibration_target:
                    Default value: 1.0 @@ -261,7 +236,7 @@

                    diff --git a/group__sparse-data-cuda.html b/group__sparse-data-cuda.html index 3b095afec..e6f5c8213 100644 --- a/group__sparse-data-cuda.html +++ b/group__sparse-data-cuda.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Sparse Data CUDA Operators + + @@ -29,7 +31,7 @@

                    - + @@ -77,46 +79,58 @@ - +

                    Functions

                    at::Tensor expand_into_jagged_permute_cuda (const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                    at::Tensor expand_into_jagged_permute_cuda (const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                     

                    Detailed Description

                    The following are CUDA operators

                    Function Documentation

                    - -

                    ◆ expand_into_jagged_permute_cuda()

                    + +

                    ◆ _float_or_half_to_fusednbitrowwise_gpu()

                    - + - - + - - + + +
                    DLL_PUBLIC Tensor expand_into_jagged_permute_cuda DLL_PUBLIC Tensor _float_or_half_to_fusednbitrowwise_gpu (const at::Tensor & permute, const Tensor & input,
                    const at::Tensor & input_offsets, const int64_t bit_rate )
                    +
                    + +
                    +
                    + +

                    ◆ expand_into_jagged_permute_cuda()

                    + +
                    +
                    + + + + + - - + - - + + - - +
                    DLL_PUBLIC Tensor expand_into_jagged_permute_cuda (const at::Tensor & permute,
                    const at::Tensor & output_offsets, const at::Tensor & input_offsets,
                    int64_t output_size const at::Tensor & output_offsets,
                    )int64_t output_size )
                    @@ -129,7 +143,11 @@

                    Returns
                    The output follows the following formula:
                    output_permute[table_offset[permute[table]] + batch] <- bag_offset[batch]
                    +
                    Returns
                    The output follows the following formula:
                    +
                    Definition fbgemm_tensor_accessor.h:128
                    +
                    index_t table_offset
                    Definition sparse_batched_unary_embeddings.cu:36
                    +
                    __global__ const offsets_t *__restrict__ int32_t const index_t *__restrict__ permute
                    Definition sparse_expand_into_jagged_permute.cu:23
                    +
                    __global__ const offsets_t *__restrict__ int32_t const index_t *__restrict__ index_t *__restrict__ output_permute
                    Definition sparse_expand_into_jagged_permute.cu:24
                    @@ -137,7 +155,7 @@

                    diff --git a/group__table-batched-embed-cuda.html b/group__table-batched-embed-cuda.html index 4fca1967a..447a75d94 100644 --- a/group__table-batched-embed-cuda.html +++ b/group__table-batched-embed-cuda.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: CUDA Operators + + @@ -29,7 +31,7 @@

                    - + @@ -77,35 +79,35 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +

                    Functions

                    std::tuple< at::Tensor, at::Tensor, c10::optional< at::Tensor > > get_unique_indices_cuda (at::Tensor linear_indices, int64_t max_indices, bool compute_count)
                    std::tuple< at::Tensor, at::Tensor, c10::optional< at::Tensor > > get_unique_indices_cuda (at::Tensor linear_indices, int64_t max_indices, bool compute_count)
                     
                    std::pair< at::Tensor, at::Tensor > lru_cache_find_uncached_cuda (at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter)
                    std::pair< at::Tensor, at::Tensorlru_cache_find_uncached_cuda (at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter)
                     
                    int64_t host_lxu_cache_slot (int64_t h_in, int64_t C)
                    int64_t host_lxu_cache_slot (int64_t h_in, int64_t C)
                     
                    at::Tensor linearize_cache_indices_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets)
                    at::Tensor linearize_cache_indices_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets)
                     
                    at::Tensor linearize_cache_indices_from_row_idx_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)
                    at::Tensor linearize_cache_indices_from_row_idx_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)
                     
                    void lru_cache_populate_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< at::Tensor > lxu_cache_locking_counter)
                    void lru_cache_populate_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< at::Tensor > lxu_cache_locking_counter)
                     
                    void lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                    void lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                     
                    void direct_mapped_lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                    void direct_mapped_lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                     
                    void lfu_cache_populate_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)
                    void lfu_cache_populate_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)
                     
                    void lfu_cache_populate_byte_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)
                    void lfu_cache_populate_byte_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)
                     
                    at::Tensor lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, c10::optional< at::Tensor > num_uniq_cache_indices, c10::optional< at::Tensor > lxu_cache_locations_output)
                    at::Tensor lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, c10::optional< at::Tensor > num_uniq_cache_indices, c10::optional< at::Tensor > lxu_cache_locations_output)
                     
                    at::Tensor direct_mapped_lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                    at::Tensor direct_mapped_lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                     
                    void lxu_cache_flush_cuda (at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)
                    void lxu_cache_flush_cuda (at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)
                     
                    void lxu_cache_locking_counter_decrement_cuda (at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)
                    void lxu_cache_locking_counter_decrement_cuda (at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)
                     
                    void lxu_cache_locations_update_cuda (at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional< at::Tensor > num_uniq_cache_indices)
                    void lxu_cache_locations_update_cuda (at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional< at::Tensor > num_uniq_cache_indices)
                     

                    Detailed Description

                    @@ -120,103 +122,82 @@

                    void direct_mapped_lru_cache_populate_byte_cuda ( - at::Tensor  - weights, + at::Tensor weights, - at::Tensor  - hash_size_cumsum, + at::Tensor hash_size_cumsum, - int64_t  - total_cache_hash_size, + int64_t total_cache_hash_size, - at::Tensor  - cache_index_table_map, + at::Tensor cache_index_table_map, - at::Tensor  - weights_offsets, + at::Tensor weights_offsets, - at::Tensor  - weights_tys, + at::Tensor weights_tys, - at::Tensor  - D_offsets, + at::Tensor D_offsets, - at::Tensor  - linear_cache_indices, + at::Tensor linear_cache_indices, - at::Tensor  - lxu_cache_state, + at::Tensor lxu_cache_state, - at::Tensor  - lxu_cache_weights, + at::Tensor lxu_cache_weights, - int64_t  - time_stamp, + int64_t time_stamp, - at::Tensor  - lru_state, + at::Tensor lru_state, - at::Tensor  - lxu_cache_miss_timestamp, + at::Tensor lxu_cache_miss_timestamp, - int64_t  - row_alignment, + int64_t row_alignment, - bool  - gather_cache_stats, + bool gather_cache_stats, - c10::optional< at::Tensor >  - uvm_cache_stats  - - - - ) - + c10::optional< at::Tensor > uvm_cache_stats )

                    @@ -663,97 +555,77 @@

                    void lru_cache_populate_byte_cuda ( - at::Tensor  - weights, + at::Tensor weights, - at::Tensor  - hash_size_cumsum, + at::Tensor hash_size_cumsum, - int64_t  - total_cache_hash_size, + int64_t total_cache_hash_size, - at::Tensor  - cache_index_table_map, + at::Tensor cache_index_table_map, - at::Tensor  - weights_offsets, + at::Tensor weights_offsets, - at::Tensor  - weights_tys, + at::Tensor weights_tys, - at::Tensor  - D_offsets, + at::Tensor D_offsets, - at::Tensor  - linear_cache_indices, + at::Tensor linear_cache_indices, - at::Tensor  - lxu_cache_state, + at::Tensor lxu_cache_state, - at::Tensor  - lxu_cache_weights, + at::Tensor lxu_cache_weights, - int64_t  - time_stamp, + int64_t time_stamp, - at::Tensor  - lru_state, + at::Tensor lru_state, - int64_t  - row_alignment, + int64_t row_alignment, - bool  - gather_cache_stats, + bool gather_cache_stats, - c10::optional< at::Tensor >  - uvm_cache_stats  - - - - ) - + c10::optional< at::Tensor > uvm_cache_stats )

                    @@ -770,103 +642,82 @@

                    void lru_cache_populate_cuda ( - at::Tensor  - weights, + at::Tensor weights, - at::Tensor  - hash_size_cumsum, + at::Tensor hash_size_cumsum, - int64_t  - total_cache_hash_size, + int64_t total_cache_hash_size, - at::Tensor  - cache_index_table_map, + at::Tensor cache_index_table_map, - at::Tensor  - weights_offsets, + at::Tensor weights_offsets, - at::Tensor  - D_offsets, + at::Tensor D_offsets, - at::Tensor  - linear_cache_indices, + at::Tensor linear_cache_indices, - at::Tensor  - lxu_cache_state, + at::Tensor lxu_cache_state, - at::Tensor  - lxu_cache_weights, + at::Tensor lxu_cache_weights, - int64_t  - time_stamp, + int64_t time_stamp, - at::Tensor  - lru_state, + at::Tensor lru_state, - bool  - stochastic_rounding, + bool stochastic_rounding, - bool  - gather_cache_stats, + bool gather_cache_stats, - c10::optional< at::Tensor >  - uvm_cache_stats, + c10::optional< at::Tensor > uvm_cache_stats, - bool  - lock_cache_line, + bool lock_cache_line, - c10::optional< at::Tensor >  - lxu_cache_locking_counter  - - - - ) - + c10::optional< at::Tensor > lxu_cache_locking_counter )

                    @@ -954,25 +791,17 @@

                    void lxu_cache_locations_update_cuda ( - at::Tensor  - lxu_cache_locations, + at::Tensor lxu_cache_locations, - at::Tensor  - lxu_cache_locations_new, + at::Tensor lxu_cache_locations_new, - c10::optional< at::Tensor >  - num_uniq_cache_indices  - - - - ) - + c10::optional< at::Tensor > num_uniq_cache_indices )

                    +
                    + +

                    ◆ reset_weight_momentum_cuda()

                    + +
                    +
                    + + + + + + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                    void reset_weight_momentum_cuda (at::Tensor dev_weights,
                    )at::Tensor uvm_weights,
                    at::Tensor lxu_cache_weights,
                    at::Tensor weights_placements,
                    at::Tensor weights_offsets,
                    at::Tensor momentum1_dev,
                    at::Tensor momentum1_uvm,
                    at::Tensor momentum1_placements,
                    at::Tensor momentum1_offsets,
                    at::Tensor D_offsets,
                    at::Tensor pruned_indices,
                    at::Tensor pruned_indices_offsets,
                    at::Tensor logical_table_ids,
                    at::Tensor buffer_ids,
                    at::Tensor cache_hash_size_cumsum,
                    at::Tensor lxu_cache_state,
                    int64_t total_cache_hash_size )
                    -

                    Lookup the LRU/LFU cache: find the cache weights location for all indices. Look up the slots in the cache corresponding to linear_cache_indices, with a sentinel value for missing.

                    diff --git a/hierarchy.html b/hierarchy.html new file mode 100644 index 000000000..2ba5ff6c3 --- /dev/null +++ b/hierarchy.html @@ -0,0 +1,145 @@ + + + + + + + +fbgemm_gpu: Class Hierarchy + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + +
                    + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + +
                    +
                    Class Hierarchy
                    +
                    +
                    +
                    This inheritance list is sorted roughly, but not completely, alphabetically:
                    +
                    [detail level 12]
                    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                     CBitonicSort< K, V, Dir, Comp >
                     CComparator< T >Warp bitonic K/V sorting code
                     CDefaultPtrTraits< T >
                     Cenable_shared_from_this
                     CEmbeddingRocksDB
                     Cenum_registration< T >
                     CFixedDivisor
                     CFunction
                     CPermutePooledEmbsFunction
                     CPermutePooledEmbsFunctionSplit< permute_pooled_embs_op >
                     CGenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >
                     CGenericPackedTensorAccessor< T, N, PtrTraits, index_t >
                     CGenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >
                     CGenericPackedTensorAccessor< T, 1, PtrTraits, index_t >
                     CGenericPackedTensorAccessorBase< T, N, DefaultPtrTraits, int64_t >
                     CHalf4
                     CHyperCompressedSparseColumn
                     CInitializer
                     Clog2_calc< x >
                     Clog2_calc_< x >
                     Clog2_calc_< 0 >
                     Crk_state
                     CSharedMemory< T >
                     CSharedMemory< double >
                     CSharedMemory< float >
                     CSharedMemory< int32_t >
                     CSharedMemory< int64_t >
                     CSharedMemory< Vec4T< at::acc_type< double, true > > >
                     CSharedMemory< Vec4T< at::acc_type< float, true > > >
                     CStackArray< T >
                     CStochasticRoundingRNGState
                     CTensorAccessorBase< T, N, PtrTraits, index_t >
                     CTensorAccessor< T, N, PtrTraits, index_t >
                     CTensorAccessorBase< T, 1, PtrTraits, index_t >
                     CTensorAccessor< T, 1, PtrTraits, index_t >
                     CTensorAccessorBase< T, N, DefaultPtrTraits, int64_t >
                     CVec4AccT
                     CVec4StepT< STEP, input_t >
                     CVec4StepT< STEP, at::Half >
                     CVec4StepT< STEP, float >
                     CVec4StepT< STEP, uint8_t >
                     CVec4T< T >
                     CVec4T< at::BFloat16 >
                     CVec4T< at::Half >
                     CVec4T< double >
                     CVec4T< float >
                     CVec4Type< T >
                     CVec4Type< at::Half >
                     CVec4Type< float >
                     CVec4Type< uint8_t >
                     CVecNT< N, PrimitiveType >
                     CVecNT< 1, PrimitiveType::FP >
                     CVecNT< 16, PrimitiveType::INT >
                     CVecNT< 2, PrimitiveType::FP >
                     CVecNT< 4, PrimitiveType::FP >
                     CVecNT< 4, PrimitiveType::INT >
                     CVecNT< 8, PrimitiveType::INT >
                     CWeightRow< emb_t, cache_t, dst_t >
                    +
                    +
                    + + + + diff --git a/histogram__binning__calibration__ops_8cu.html b/histogram__binning__calibration__ops_8cu.html new file mode 100644 index 000000000..18f5f73b5 --- /dev/null +++ b/histogram__binning__calibration__ops_8cu.html @@ -0,0 +1,115 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/histogram_binning_calibration_ops.cu File Reference + + + + + + + + + + + +
                    +
                    + + + + + + +
                    +
                    fbgemm_gpu +
                    +
                    +
                    + + + + + + + + +
                    +
                    + + +
                    +
                    +
                    +
                    +
                    +
                    Loading...
                    +
                    Searching...
                    +
                    No Matches
                    +
                    +
                    +
                    +
                    + + +
                    +
                    + +
                    histogram_binning_calibration_ops.cu File Reference
                    +
                    +
                    +
                    #include <ATen/Dispatch.h>
                    +#include <c10/cuda/CUDAGuard.h>
                    +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                    +#include "fbgemm_gpu/sparse_ops.h"
                    +#include "fbgemm_gpu/sparse_ops_utils.h"
                    +
                    + + + +

                    +Namespaces

                    namespace  fbgemm_gpu
                     
                    +

                    Typedef Documentation

                    + +

                    ◆ Tensor

                    + +
                    +
                    + + + + +
                    using Tensor = at::Tensor
                    +
                    + +
                    +
                    +
                    + + + + diff --git a/index.html b/index.html index 4b796d4c3..3e564af26 100644 --- a/index.html +++ b/index.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -251,18 +253,19 @@ -

                    FBGEMM_GPU General Info

                    +

                    FBGEMM_GPU General Info

                    -

                    FBGEMM_GPU Python API

                    +

                    FBGEMM_GPU Python API

                    -

                    FBGEMM_GPU C++ API

                    +

                    FBGEMM_GPU C++ API

                    • Sparse Data Operators
                    • Quantization Operators
                    • @@ -350,11 +353,11 @@
                      -

                      Welcome to FBGEMM’s documentation!

                      +

                      Welcome to FBGEMM’s documentation!

                      This documentation provides a comprehensive reference of the fbgemm_gpu library.

                      -
                      -

                      FBGEMM_GPU General Info

                      + -
                      -

                      FBGEMM_GPU Python API

                      + -
                      -

                      FBGEMM_GPU C++ API

                      +
                      +

                      FBGEMM_GPU C++ API

                      • Sparse Data Operators
                      • Quantization Operators
                          @@ -489,11 +515,9 @@

                          Welcome to FBGEMM’s documentation! - - + - - + diff --git a/input__combine_8cu.html b/input__combine_8cu.html new file mode 100644 index 000000000..6086d5479 --- /dev/null +++ b/input__combine_8cu.html @@ -0,0 +1,113 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/input_combine_ops/input_combine.cu File Reference + + + + + + + + + + + + +
                          + +
                          input_combine.cu File Reference
                          +
                          +
                          +
                          #include <c10/cuda/CUDAGuard.h>
                          +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                          +#include "fbgemm_gpu/input_combine.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/input__combine_8h.html b/input__combine_8h.html new file mode 100644 index 000000000..187383878 --- /dev/null +++ b/input__combine_8h.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/input_combine.h File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          input_combine.h File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +
                          + + + + diff --git a/input__combine__cpu_8cpp.html b/input__combine__cpu_8cpp.html new file mode 100644 index 000000000..44cbb52aa --- /dev/null +++ b/input__combine__cpu_8cpp.html @@ -0,0 +1,151 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/input_combine_ops/input_combine_cpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          input_combine_cpu.cpp File Reference
                          +
                          +
                          +
                          #include "fbgemm_gpu/dispatch_macros.h"
                          +#include "fbgemm_gpu/input_combine.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +#include <ATen/ATen.h>
                          +#include <ATen/Context.h>
                          +#include <ATen/Dispatch.h>
                          +#include <ATen/Functions.h>
                          +#include <ATen/TypeDefault.h>
                          +#include <ATen/core/op_registration/op_registration.h>
                          +#include <c10/core/ScalarType.h>
                          +#include <c10/core/TensorOptions.h>
                          +#include <c10/util/Exception.h>
                          +#include <torch/script.h>
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + +

                          +Functions

                          std::tuple< Tensor, Tensor, Tensorpadding_fused_tbe_input_combine_with_length_cpu (const std::vector< Tensor > &indices_list, const std::vector< Tensor > &lengths_list, const std::vector< Tensor > &per_sample_weights, int64_t batch_size)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_FRAGMENT()

                          + +
                          +
                          + + + + + + + + + + + +
                          TORCH_LIBRARY_FRAGMENT (fbgemm ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/input__combine__gpu_8cpp.html b/input__combine__gpu_8cpp.html new file mode 100644 index 000000000..498dfc015 --- /dev/null +++ b/input__combine__gpu_8cpp.html @@ -0,0 +1,115 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/input_combine_ops/input_combine_gpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          input_combine_gpu.cpp File Reference
                          +
                          +
                          +
                          #include "fbgemm_gpu/input_combine.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +#include <ATen/ATen.h>
                          +#include <ATen/core/op_registration/op_registration.h>
                          +#include <torch/library.h>
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__dense__bmm__forward_8cu.html b/jagged__dense__bmm__forward_8cu.html new file mode 100644 index 000000000..b39bcf655 --- /dev/null +++ b/jagged__dense__bmm__forward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_bmm_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_dense_bmm_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_bmm_forward" ,
                          fbgemm_gpu::jagged_dense_bmm_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html b/jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html new file mode 100644 index 000000000..cf46c32de --- /dev/null +++ b/jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html @@ -0,0 +1,189 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_dense_elementwise_add_jagged_output_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_dense_dense_elementwise_add_jagged_output_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Macro Definition Documentation

                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          { \
                          +
                          dim3 threads, blocks; \
                          +
                          StackArray<int64_t> jagged_dims_tensor; \
                          +
                          std::tie(threads, blocks, jagged_dims_tensor) = \
                          +
                          check_shape_and_partition_(x_values, x_offsets, y_0); \
                          +
                          blocks.x = div_round_up(x_values.size(0), threads.y); \
                          +
                          std::vector<Tensor> x_offsets_contig; \
                          +
                          x_offsets_contig.resize(num_jagged_dim); \
                          +
                          StackArray<index_t*> x_offset_ptrs; \
                          +
                          x_offset_ptrs.ndim = num_jagged_dim; \
                          +
                          StackArray<int64_t> x_offset_sizes; \
                          +
                          x_offset_sizes.ndim = num_jagged_dim; \
                          +
                          for (int d = 0; d < num_jagged_dim; ++d) { \
                          +
                          x_offsets_contig[d] = x_offsets[d].contiguous(); \
                          +
                          x_offset_ptrs.vals[d] = \
                          +
                          x_offsets_contig[d].template data_ptr<index_t>(); \
                          +
                          x_offset_sizes.vals[d] = x_offsets[d].numel(); \
                          +
                          } \
                          +
                          jagged_dense_dense_elementwise_jagged_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          index_t><<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
                          +
                          x_values.packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(), \
                          +
                          x_offset_ptrs, \
                          +
                          x_offset_sizes, \
                          +
                          y_0_reshaped.packed_accessor32<scalar_t, 3, at::RestrictPtrTraits>(), \
                          +
                          y_1_reshaped.packed_accessor32<scalar_t, 3, at::RestrictPtrTraits>(), \
                          +
                          output_values.packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(), \
                          +
                          jagged_dims_tensor, \
                          +
                          f); \
                          +
                          }
                          +
                          Definition sparse_ops_utils.h:446
                          +
                          size_t ndim
                          Definition sparse_ops_utils.h:448
                          +
                          T vals[kStackArrayMaxDims]
                          Definition sparse_ops_utils.h:447
                          +
                          +
                          +
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_dense_elementwise_add_jagged_output_forward" ,
                          fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output_forward  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__dense__elementwise__mul__backward_8cu.html b/jagged__dense__elementwise__mul__backward_8cu.html new file mode 100644 index 000000000..f9958533b --- /dev/null +++ b/jagged__dense__elementwise__mul__backward_8cu.html @@ -0,0 +1,188 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_backward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_dense_elementwise_mul_backward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + + +

                          +Functions

                          template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                          __global__ __launch_bounds__ (kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta
                           
                          +

                          Macro Definition Documentation

                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          { \
                          +
                          std::vector<Tensor> x_offsets_contig; \
                          +
                          x_offsets_contig.resize(num_jagged_dim); \
                          +
                          StackArray<index_t*> x_offset_ptrs; \
                          +
                          x_offset_ptrs.ndim = num_jagged_dim; \
                          +
                          for (int d = 0; d < num_jagged_dim; ++d) { \
                          +
                          x_offsets_contig[d] = x_offsets[d].contiguous(); \
                          +
                          x_offset_ptrs.vals[d] = \
                          +
                          x_offsets_contig[d].template data_ptr<index_t>(); \
                          +
                          } \
                          +
                          [[maybe_unused]] const auto func_name = \
                          +
                          "jagged_jagged_elementwise_dense_output_kernel_"; \
                          +
                          jagged_jagged_elementwise_dense_output_kernel_<NUM_JAGGED_DIM, index_t> \
                          +
                          <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
                          +
                          MAKE_PTA_WITH_NAME(func_name, x_values, scalar_t, 2, 32), \
                          +
                          x_offset_ptrs, \
                          +
                          MAKE_PTA_WITH_NAME(func_name, y_values, scalar_t, 2, 32), \
                          +
                          MAKE_PTA_WITH_NAME(func_name, output_reshaped, scalar_t, 3, 32), \
                          +
                          jagged_dims_tensor, \
                          +
                          f, \
                          +
                          padding_value); \
                          +
                          }
                          +
                          Definition sparse_ops_utils.h:446
                          +
                          size_t ndim
                          Definition sparse_ops_utils.h:448
                          +
                          T vals[kStackArrayMaxDims]
                          Definition sparse_ops_utils.h:447
                          +
                          +
                          +
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_elementwise_mul_backward" ,
                          fbgemm_gpu::jagged_dense_elementwise_mul_backward  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__dense__elementwise__mul__forward_8cu.html b/jagged__dense__elementwise__mul__forward_8cu.html new file mode 100644 index 000000000..691220c66 --- /dev/null +++ b/jagged__dense__elementwise__mul__forward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_dense_elementwise_mul_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_dense_elementwise_mul_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_elementwise_mul_forward" ,
                          fbgemm_gpu::jagged_dense_elementwise_mul_forward  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__index__add__2d__forward_8cu.html b/jagged__index__add__2d__forward_8cu.html new file mode 100644 index 000000000..0f8cca4c2 --- /dev/null +++ b/jagged__index__add__2d__forward_8cu.html @@ -0,0 +1,144 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_add_2d_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_index_add_2d_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + +

                          +Functions

                          Tensor jagged_index_add_2d_forward_cuda (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_input_rows, const int64_t num_output_rows)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_index_add_2d_forward" ,
                          fbgemm_gpu::jagged_index_add_2d_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__index__select__2d__forward_8cu.html b/jagged__index__select__2d__forward_8cu.html new file mode 100644 index 000000000..ff06852e1 --- /dev/null +++ b/jagged__index__select__2d__forward_8cu.html @@ -0,0 +1,144 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_index_select_2d_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_index_select_2d_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + +

                          +Functions

                          Tensor jagged_index_select_2d_forward_cuda (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_output_rows)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_index_select_2d_forward" ,
                          fbgemm_gpu::jagged_index_select_2d_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__jagged__bmm__forward_8cu.html b/jagged__jagged__bmm__forward_8cu.html new file mode 100644 index 000000000..9b2b1ba00 --- /dev/null +++ b/jagged__jagged__bmm__forward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_jagged_bmm_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_jagged_bmm_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_jagged_bmm_forward" ,
                          fbgemm_gpu::jagged_jagged_bmm_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__softmax__backward_8cu.html b/jagged__softmax__backward_8cu.html new file mode 100644 index 000000000..8d3f75f00 --- /dev/null +++ b/jagged__softmax__backward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_softmax_backward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_softmax_backward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_softmax_backward" ,
                          fbgemm_gpu::jagged_softmax_backward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__softmax__forward_8cu.html b/jagged__softmax__forward_8cu.html new file mode 100644 index 000000000..289566815 --- /dev/null +++ b/jagged__softmax__forward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_softmax_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_softmax_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_softmax_forward" ,
                          fbgemm_gpu::jagged_softmax_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__tensor__ops_2common_8cuh.html b/jagged__tensor__ops_2common_8cuh.html new file mode 100644 index 000000000..d69d14e8b --- /dev/null +++ b/jagged__tensor__ops_2common_8cuh.html @@ -0,0 +1,244 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/common.cuh File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          common.cuh File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/Dispatch.h>
                          +#include <ATen/cuda/CUDAContext.h>
                          +#include <ATen/cuda/Exceptions.h>
                          +#include <c10/cuda/CUDAGuard.h>
                          +#include <torch/csrc/autograd/custom_function.h>
                          +#include <torch/library.h>
                          +#include <ATen/cuda/Atomic.cuh>
                          +#include <cub/cub.cuh>
                          +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                          +#include <cub/device/device_scan.cuh>
                          +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                          +#include "fbgemm_gpu/dispatch_macros.h"
                          +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                          +#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
                          +#include "fbgemm_gpu/ops_utils.h"
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Macro Definition Documentation

                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM [1/2]

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          { \
                          +
                          std::vector<Tensor> x_offsets_contig; \
                          +
                          x_offsets_contig.resize(num_jagged_dim); \
                          +
                          StackArray<index_t*> x_offset_ptrs; \
                          +
                          x_offset_ptrs.ndim = num_jagged_dim; \
                          +
                          for (int d = 0; d < num_jagged_dim; ++d) { \
                          +
                          x_offsets_contig[d] = x_offsets[d].contiguous(); \
                          +
                          x_offset_ptrs.vals[d] = \
                          +
                          x_offsets_contig[d].template data_ptr<index_t>(); \
                          +
                          } \
                          +
                          jagged_dense_elementwise_dense_output_kernel_<NUM_JAGGED_DIM, index_t> \
                          +
                          <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>( \
                          +
                          x_values.packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(), \
                          +
                          x_offset_ptrs, \
                          +
                          y_reshaped \
                          +
                          .packed_accessor32<scalar_t, 3, at::RestrictPtrTraits>(), \
                          +
                          output_reshaped \
                          +
                          .packed_accessor32<scalar_t, 3, at::RestrictPtrTraits>(), \
                          +
                          jagged_dims_tensor, \
                          +
                          f, \
                          +
                          padding_value); \
                          +
                          }
                          +
                          Definition sparse_ops_utils.h:446
                          +
                          size_t ndim
                          Definition sparse_ops_utils.h:448
                          +
                          T vals[kStackArrayMaxDims]
                          Definition sparse_ops_utils.h:447
                          +
                          +
                          +
                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM [2/2]

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          + +
                          +
                          +

                          Variable Documentation

                          + +

                          ◆ []

                          + +
                          +
                          + + + + +
                          union { ... } ::VecType32::Data data
                          +
                          + +
                          +
                          + +

                          ◆ half4

                          + +
                          +
                          + + + + +
                          half4
                          +
                          + +
                          +
                          + +

                          ◆ half8

                          + +
                          +
                          + + + + +
                          half8
                          +
                          + +
                          +
                          + +

                          ◆ mask

                          + +
                          +
                          + + + + +
                          TType mask
                          +
                          + +
                          +
                          + +

                          ◆ val

                          + +
                          +
                          + + + + +
                          __half2 val
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__tensor__ops_8cu.html b/jagged__tensor__ops_8cu.html new file mode 100644 index 000000000..487bcdadf --- /dev/null +++ b/jagged__tensor__ops_8cu.html @@ -0,0 +1,375 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          +
                          jagged_tensor_ops.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH() [1/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "batched_dense_vec_jagged_2d_mul" ,
                          fbgemm_gpu::batched_dense_vec_jagged_2d_mul  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [2/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "dense_to_jagged" ,
                          fbgemm_gpu::dense_to_jagged  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [3/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_1d_to_dense" ,
                          fbgemm_gpu::jagged_1d_to_dense  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [4/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_2d_to_dense" ,
                          fbgemm_gpu::jagged_2d_to_dense  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [5/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_bmm" ,
                          fbgemm_gpu::jagged_dense_bmm  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [6/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_dense_elementwise_add_jagged_output" ,
                          fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [7/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_elementwise_add" ,
                          fbgemm_gpu::jagged_dense_elementwise_add  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [8/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_elementwise_mul" ,
                          fbgemm_gpu::jagged_dense_elementwise_mul  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [9/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_jagged_bmm" ,
                          fbgemm_gpu::jagged_jagged_bmm  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [10/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_softmax" ,
                          fbgemm_gpu::jagged_softmax  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [11/11]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_to_padded_dense" ,
                          fbgemm_gpu::jagged_to_padded_dense  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__tensor__ops__autograd_8cpp.html b/jagged__tensor__ops__autograd_8cpp.html new file mode 100644 index 000000000..f6bf1fe2a --- /dev/null +++ b/jagged__tensor__ops__autograd_8cpp.html @@ -0,0 +1,168 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_autograd.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_tensor_ops_autograd.cpp File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/AccumulateType.h>
                          +#include <ATen/TensorUtils.h>
                          +#include <ATen/core/dispatch/Dispatcher.h>
                          +#include <c10/core/SymIntArrayRef.h>
                          +#include <torch/csrc/autograd/custom_function.h>
                          +#include <torch/library.h>
                          +#include <torch/torch.h>
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + + + + + +

                          +Functions

                          Tensor jagged_dense_elementwise_add (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                           
                          std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                           
                          std::vector< Tensorjagged_index_select_2d (const Tensor &values, const Tensor &lengths, const Tensor &indices)
                           
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_IMPL() [1/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          Autograd ,
                          m  )
                          +
                          + +
                          +
                          + +

                          ◆ TORCH_LIBRARY_IMPL() [2/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          CompositeImplicitAutograd ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__tensor__ops__cpu_8cpp.html b/jagged__tensor__ops__cpu_8cpp.html new file mode 100644 index 000000000..d648a2fdd --- /dev/null +++ b/jagged__tensor__ops__cpu_8cpp.html @@ -0,0 +1,276 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_cpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_tensor_ops_cpu.cpp File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/AccumulateType.h>
                          +#include <ATen/core/dispatch/Dispatcher.h>
                          +#include <torch/csrc/autograd/custom_function.h>
                          +#include <torch/library.h>
                          +#include "ATen/Parallel.h"
                          +#include "fbgemm_gpu/dispatch_macros.h"
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + + + + + +

                          +Functions

                          Tensor jagged_index_select_2d_forward_cpu (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_output_rows)
                           
                          Tensor jagged_index_add_2d_forward_cpu (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_input_rows, const int64_t num_output_rows)
                           
                          Tensor jagged_slice_forward_cpu (const Tensor &x_values, const Tensor &x_lengths, const Tensor &src_start, const Tensor &output_lengths, const Tensor &tgt_start, const int64_t num_output_rows, const int64_t slice_length, const bool fill_zeros)
                           
                          +

                          Macro Definition Documentation

                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM [1/3]

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          if (y.size(-1) == 1) { \
                          +
                          jagged_dense_elementwise_dense_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          true, \
                          +
                          index_t>(x_values, x_offsets, y, output, f, padding_value); \
                          +
                          } else { \
                          +
                          jagged_dense_elementwise_dense_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          false, \
                          +
                          index_t>(x_values, x_offsets, y, output, f, padding_value); \
                          +
                          }
                          +
                          template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                          Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                          +
                          +
                          +
                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM [2/3]

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          if (y.size(-1) == 1) { \
                          +
                          jagged_dense_elementwise_jagged_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          true, \
                          +
                          index_t, \
                          +
                          scalar_t>(x_values, x_offsets, y, output_values, f); \
                          +
                          } else { \
                          +
                          jagged_dense_elementwise_jagged_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          false, \
                          +
                          index_t, \
                          +
                          scalar_t>(x_values, x_offsets, y, output_values, f); \
                          +
                          }
                          +
                          +
                          +
                          + +

                          ◆ INVOKE_KERNEL_WITH_DIM [3/3]

                          + +
                          +
                          + + + + + + + +
                          #define INVOKE_KERNEL_WITH_DIM( NUM_JAGGED_DIM)
                          +
                          +Value:
                          if (output.size(-1) == 1) { \
                          +
                          jagged_jagged_elementwise_dense_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          true, \
                          +
                          index_t>(x_values, x_offsets, y_values, output, f, padding_value); \
                          +
                          } else { \
                          +
                          jagged_jagged_elementwise_dense_output_kernel_< \
                          +
                          NUM_JAGGED_DIM, \
                          +
                          false, \
                          +
                          index_t>(x_values, x_offsets, y_values, output, f, padding_value); \
                          +
                          }
                          +
                          +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_FRAGMENT()

                          + +
                          +
                          + + + + + + + + + + + +
                          TORCH_LIBRARY_FRAGMENT (fbgemm ,
                          m  )
                          +
                          + +
                          +
                          + +

                          ◆ TORCH_LIBRARY_IMPL() [1/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          CompositeExplicitAutograd ,
                          m  )
                          +
                          + +
                          +
                          + +

                          ◆ TORCH_LIBRARY_IMPL() [2/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          CPU ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__tensor__ops__meta_8cpp.html b/jagged__tensor__ops__meta_8cpp.html new file mode 100644 index 000000000..5c9d697c7 --- /dev/null +++ b/jagged__tensor__ops__meta_8cpp.html @@ -0,0 +1,128 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_tensor_ops_meta.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_tensor_ops_meta.cpp File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/AccumulateType.h>
                          +#include <torch/csrc/autograd/custom_function.h>
                          +#include <torch/library.h>
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_IMPL()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          Meta ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__to__padded__dense__backward_8cu.html b/jagged__to__padded__dense__backward_8cu.html new file mode 100644 index 000000000..38e364590 --- /dev/null +++ b/jagged__to__padded__dense__backward_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_backward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_to_padded_dense_backward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_to_padded_dense_backward" ,
                          fbgemm_gpu::jagged_to_padded_dense_backward  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__to__padded__dense__forward_8cu.html b/jagged__to__padded__dense__forward_8cu.html new file mode 100644 index 000000000..5e0fe9d5f --- /dev/null +++ b/jagged__to__padded__dense__forward_8cu.html @@ -0,0 +1,277 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_to_padded_dense_forward.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_to_padded_dense_forward.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                          +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                          +#include <cub/device/device_scan.cuh>
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + +

                          +Functions

                          std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH() [1/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_dense_elementwise_add_jagged_output" ,
                          fbgemm_gpu::jagged_dense_elementwise_add_jagged_output_cuda  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [2/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_to_padded_dense_forward" ,
                          fbgemm_gpu::jagged_to_padded_dense_forward  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [3/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "stacked_jagged_1d_to_dense" ,
                          fbgemm_gpu::stacked_jagged_1d_to_dense_gpu  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [4/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "stacked_jagged_2d_to_dense" ,
                          fbgemm_gpu::stacked_jagged_2d_to_dense_gpu  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [5/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "stacked_jagged_2d_to_dense_backward" ,
                          fbgemm_gpu::stacked_jagged_2d_to_dense_backward_cuda  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [6/6]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "stacked_jagged_2d_to_dense_forward" ,
                          fbgemm_gpu::stacked_jagged_2d_to_dense_forward_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/jagged__unique__indices_8cu.html b/jagged__unique__indices_8cu.html new file mode 100644 index 000000000..1efb567b3 --- /dev/null +++ b/jagged__unique__indices_8cu.html @@ -0,0 +1,164 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/jagged_unique_indices.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          jagged_unique_indices.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH() [1/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_hash_size_cumsum" ,
                          fbgemm_gpu::jagged_hash_size_cumsum_cuda  )
                          +
                          + +
                          +
                          + +

                          ◆ FBGEMM_OP_DISPATCH() [2/2]

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "jagged_unique_indices" ,
                          fbgemm_gpu::jagged_unique_indices_cuda  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/keyed__jagged__index__select__dim1_8cu.html b/keyed__jagged__index__select__dim1_8cu.html new file mode 100644 index 000000000..58d50bfef --- /dev/null +++ b/keyed__jagged__index__select__dim1_8cu.html @@ -0,0 +1,194 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/keyed_jagged_index_select_dim1.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          keyed_jagged_index_select_dim1.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Macro Definition Documentation

                          + +

                          ◆ LAUNCH_KERNEL

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + +
                          #define LAUNCH_KERNEL( WEIGHTED,
                          WEIGHT_TYPE,
                          OUTPUT_WEIGHTS,
                          WEIGHTS )
                          +
                          +Value:
                          { \
                          +
                          keyed_jagged_index_select_dim1_kernel< \
                          +
                          value_t, \
                          +
                          index_t, \
                          +
                          offset_t, \
                          +
                          WEIGHT_TYPE, \
                          +
                          WEIGHTED> \
                          +
                          <<<grid_size, kMaxThreads, 0, at::cuda::getCurrentCUDAStream()>>>( \
                          +
                          output.packed_accessor64<value_t, 1, at::RestrictPtrTraits>(), \
                          +
                          OUTPUT_WEIGHTS \
                          +
                          .packed_accessor64<WEIGHT_TYPE, 1, at::RestrictPtrTraits>(), \
                          +
                          values.packed_accessor64<value_t, 1, at::RestrictPtrTraits>(), \
                          +
                          WEIGHTS \
                          +
                          .packed_accessor64<WEIGHT_TYPE, 1, at::RestrictPtrTraits>(), \
                          +
                          offsets.packed_accessor32<offset_t, 1, at::RestrictPtrTraits>(), \
                          +
                          indices.packed_accessor32<index_t, 1, at::RestrictPtrTraits>(), \
                          +
                          output_offsets_contig \
                          +
                          ->packed_accessor32<offset_t, 1, at::RestrictPtrTraits>(), \
                          +
                          num_batches, \
                          +
                          batch_size); \
                          +
                          }
                          +
                          template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                          Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                          +
                          template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices
                          Definition gen_batch_index_select_dim0_forward_kernel_small.cu:123
                          +
                          template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets
                          Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:104
                          +
                          +
                          +
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ FBGEMM_OP_DISPATCH()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          FBGEMM_OP_DISPATCH (CUDA ,
                          "keyed_jagged_index_select_dim1" ,
                          fbgemm_gpu::keyed_jagged_index_select_dim_1_gpu  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/layout__transform__ops_8cu.html b/layout__transform__ops_8cu.html new file mode 100644 index 000000000..a750b50e2 --- /dev/null +++ b/layout__transform__ops_8cu.html @@ -0,0 +1,124 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          layout_transform_ops.cu File Reference
                          +
                          +
                          +
                          #include "fbgemm_gpu/cub_namespace_prefix.cuh"
                          +#include <cub/device/device_scan.cuh>
                          +#include "fbgemm_gpu/dispatch_macros.h"
                          +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                          +#include <ATen/ATen.h>
                          +#include <ATen/core/op_registration/op_registration.h>
                          +#include <ATen/cuda/CUDAContext.h>
                          +#include <ATen/cuda/Exceptions.h>
                          +#include <c10/cuda/CUDAGuard.h>
                          +#include <torch/library.h>
                          +#include "ATen/Parallel.h"
                          +#include "fbgemm_gpu/layout_transform_ops.cuh"
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/layout__transform__ops_8cuh.html b/layout__transform__ops_8cuh.html new file mode 100644 index 000000000..2e7c04572 --- /dev/null +++ b/layout__transform__ops_8cuh.html @@ -0,0 +1,191 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/layout_transform_ops.cuh File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          +
                          layout_transform_ops.cuh File Reference
                          +
                          +
                          +
                          #include <cuda.h>
                          +#include "./fbgemm_cuda_utils.cuh"
                          +

                          Function Documentation

                          + +

                          ◆ permute_pooled_embs_kernel()

                          + +
                          +
                          +
                          +template<typename scalar_t >
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          __global__ void permute_pooled_embs_kernel (const scalar_t *__restrict__ go,
                          const int64_t *__restrict__ offset_dim_list,
                          const int64_t *__restrict__ permute_list,
                          const int64_t *__restrict__ inv_offset_dim_list,
                          scalar_t *__restrict__ sgo,
                          const int64_t B,
                          const int64_t T,
                          const int64_t dim_sum )
                          +
                          + +
                          +
                          + +

                          ◆ recat_copy_async_kernel()

                          + +
                          +
                          +
                          +template<typename scalar_t >
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          __global__ void recat_copy_async_kernel (const int64_t *__restrict__ dim_sum_per_rank,
                          const int64_t *__restrict__ cum_dim_sum_per_rank,
                          const scalar_t *__restrict__ go,
                          scalar_t *__restrict__ sgo,
                          const int64_t T,
                          const int64_t B,
                          const int64_t dim_sum )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/layout__transform__ops__cpu_8cpp.html b/layout__transform__ops__cpu_8cpp.html new file mode 100644 index 000000000..12cf69d4e --- /dev/null +++ b/layout__transform__ops__cpu_8cpp.html @@ -0,0 +1,164 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_cpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          layout_transform_ops_cpu.cpp File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/core/op_registration/op_registration.h>
                          +#include <torch/library.h>
                          +#include "ATen/Parallel.h"
                          +#include "fbgemm_gpu/dispatch_macros.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_FRAGMENT()

                          + +
                          +
                          + + + + + + + + + + + +
                          TORCH_LIBRARY_FRAGMENT (fbgemm ,
                          m  )
                          +
                          + +
                          +
                          + +

                          ◆ TORCH_LIBRARY_IMPL()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          CPU ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/layout__transform__ops__gpu_8cpp.html b/layout__transform__ops__gpu_8cpp.html new file mode 100644 index 000000000..6a16e62b0 --- /dev/null +++ b/layout__transform__ops__gpu_8cpp.html @@ -0,0 +1,119 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/layout_transform_ops/layout_transform_ops_gpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          +
                          layout_transform_ops_gpu.cpp File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/core/op_registration/op_registration.h>
                          +#include <torch/library.h>
                          +#include "fbgemm_gpu/sparse_ops.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +

                          Function Documentation

                          + +

                          ◆ TORCH_LIBRARY_IMPL()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          TORCH_LIBRARY_IMPL (fbgemm ,
                          CUDA ,
                          m  )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lfu__cache__find_8cu.html b/lfu__cache__find_8cu.html new file mode 100644 index 000000000..90a9b2a0b --- /dev/null +++ b/lfu__cache__find_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_find.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lfu_cache_find.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lfu__cache__populate_8cu.html b/lfu__cache__populate_8cu.html new file mode 100644 index 000000000..5a6401060 --- /dev/null +++ b/lfu__cache__populate_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lfu_cache_populate.cu File Reference
                          +
                          + + + + + diff --git a/lfu__cache__populate__byte_8cpp.html b/lfu__cache__populate__byte_8cpp.html new file mode 100644 index 000000000..053068a9f --- /dev/null +++ b/lfu__cache__populate__byte_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lfu_cache_populate_byte.cpp File Reference
                          +
                          +
                          +
                          #include "common.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lfu__cache__populate__byte_8cu.html b/lfu__cache__populate__byte_8cu.html new file mode 100644 index 000000000..7ecc43285 --- /dev/null +++ b/lfu__cache__populate__byte_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lfu_cache_populate_byte.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lfu_cache_populate_byte.cu File Reference
                          +
                          + + + + + diff --git a/linearize__cache__indices_8cpp.html b/linearize__cache__indices_8cpp.html new file mode 100644 index 000000000..7858d69fa --- /dev/null +++ b/linearize__cache__indices_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          linearize_cache_indices.cpp File Reference
                          +
                          +
                          +
                          #include "common.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/linearize__cache__indices_8cu.html b/linearize__cache__indices_8cu.html new file mode 100644 index 000000000..ef506c7e9 --- /dev/null +++ b/linearize__cache__indices_8cu.html @@ -0,0 +1,115 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/linearize_cache_indices.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          linearize_cache_indices.cu File Reference
                          +
                          + + + + + diff --git a/lru__cache__find_8cu.html b/lru__cache__find_8cu.html new file mode 100644 index 000000000..23449bc14 --- /dev/null +++ b/lru__cache__find_8cu.html @@ -0,0 +1,143 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lru_cache_find.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lru_cache_find.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Functions

                          DLL_PUBLIC std::pair< Tensor, Tensorlru_cache_find_uncached_cuda (Tensor unique_indices, Tensor unique_indices_length, int64_t max_indices, Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, bool gather_cache_stats, Tensor uvm_cache_stats, bool lock_cache_line, Tensor lxu_cache_locking_counter)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ emulate_cache_miss()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC Tensor emulate_cache_miss (Tensor lxu_cache_locations,
                          const int64_t enforced_misses_per_256,
                          const bool gather_cache_stats,
                          Tensor uvm_cache_stats )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lru__cache__populate_8cu.html b/lru__cache__populate_8cu.html new file mode 100644 index 000000000..740c24681 --- /dev/null +++ b/lru__cache__populate_8cu.html @@ -0,0 +1,195 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          +
                          lru_cache_populate.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ lru_cache_populate_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC void lru_cache_populate_cuda (Tensor weights,
                          Tensor cache_hash_size_cumsum,
                          const int64_t total_cache_hash_size,
                          Tensor cache_index_table_map,
                          Tensor weights_offsets,
                          Tensor D_offsets,
                          Tensor linear_cache_indices,
                          Tensor lxu_cache_state,
                          Tensor lxu_cache_weights,
                          const int64_t time_stamp,
                          Tensor lru_state,
                          const bool stochastic_rounding,
                          bool gather_cache_stats,
                          c10::optional< Tensor > uvm_cache_stats,
                          bool lock_cache_line,
                          c10::optional< Tensor > lxu_cache_locking_counter )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lru__cache__populate__byte_8cpp.html b/lru__cache__populate__byte_8cpp.html new file mode 100644 index 000000000..a03028e76 --- /dev/null +++ b/lru__cache__populate__byte_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lru_cache_populate_byte.cpp File Reference
                          +
                          +
                          +
                          #include "common.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lru__cache__populate__byte_8cu.html b/lru__cache__populate__byte_8cu.html new file mode 100644 index 000000000..982234513 --- /dev/null +++ b/lru__cache__populate__byte_8cu.html @@ -0,0 +1,281 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lru_cache_populate_byte.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          +
                          lru_cache_populate_byte.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ direct_mapped_lru_cache_populate_byte_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cuda (Tensor weights,
                          Tensor cache_hash_size_cumsum,
                          int64_t total_cache_hash_size,
                          Tensor cache_index_table_map,
                          Tensor weights_offsets,
                          Tensor weights_tys,
                          Tensor D_offsets,
                          Tensor linear_cache_indices,
                          Tensor lxu_cache_state,
                          Tensor lxu_cache_weights,
                          int64_t time_stamp,
                          Tensor lru_state,
                          Tensor lxu_cache_miss_timestamp,
                          int64_t row_alignment,
                          bool gather_cache_stats,
                          c10::optional< Tensor > uvm_cache_stats )
                          +
                          + +
                          +
                          + +

                          ◆ lru_cache_populate_byte_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC void lru_cache_populate_byte_cuda (Tensor weights,
                          Tensor cache_hash_size_cumsum,
                          int64_t total_cache_hash_size,
                          Tensor cache_index_table_map,
                          Tensor weights_offsets,
                          Tensor weights_tys,
                          Tensor D_offsets,
                          Tensor linear_cache_indices,
                          Tensor lxu_cache_state,
                          Tensor lxu_cache_weights,
                          int64_t time_stamp,
                          Tensor lru_state,
                          int64_t row_alignment,
                          bool gather_cache_stats,
                          c10::optional< Tensor > uvm_cache_stats )
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lxu__cache_8cpp.html b/lxu__cache_8cpp.html new file mode 100644 index 000000000..628e0d646 --- /dev/null +++ b/lxu__cache_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lxu_cache.cpp File Reference
                          +
                          +
                          +
                          #include "common.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/lxu__cache_8cu.html b/lxu__cache_8cu.html new file mode 100644 index 000000000..8c5f69a23 --- /dev/null +++ b/lxu__cache_8cu.html @@ -0,0 +1,240 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/lxu_cache.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          lxu_cache.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + + + + + + + +

                          +Functions

                          DLL_PUBLIC int64_t host_lxu_cache_slot (int64_t h_in, int64_t C)
                           
                          DLL_PUBLIC void lxu_cache_flush_cuda (Tensor uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor lxu_cache_state, Tensor lxu_cache_weights, bool stochastic_rounding)
                           
                          void lxu_cache_locking_counter_decrement_cuda (at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)
                           
                          DLL_PUBLIC Tensor lxu_cache_lookup_cuda (const Tensor linear_cache_indices, const Tensor lxu_cache_state, const int64_t invalid_index, const bool gather_cache_stats, const c10::optional< Tensor > uvm_cache_stats, const c10::optional< Tensor > num_uniq_cache_indices, const c10::optional< Tensor > lxu_cache_locations_output)
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +

                          Function Documentation

                          + +

                          ◆ direct_mapped_lxu_cache_lookup_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cuda (Tensor linear_cache_indices,
                          Tensor lxu_cache_state,
                          int64_t invalid_index,
                          bool gather_cache_stats,
                          c10::optional< Tensor > uvm_cache_stats )
                          +
                          + +
                          +
                          + +

                          ◆ lxu_cache_locations_update_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC void lxu_cache_locations_update_cuda (Tensor lxu_cache_locations,
                          Tensor lxu_cache_locations_new,
                          c10::optional< Tensor > num_uniq_cache_indices )
                          +
                          + +
                          +
                          + +

                          ◆ lxu_cache_lookup_cuda()

                          + +
                          +
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                          DLL_PUBLIC Tensor lxu_cache_lookup_cuda (const Tensor linear_cache_indices,
                          const Tensor lxu_cache_state,
                          const int64_t invalid_index,
                          const bool gather_cache_stats,
                          const c10::optional< Tensor > uvm_cache_stats,
                          const c10::optional< Tensor > num_uniq_cache_indices,
                          const c10::optional< Tensor > lxu_cache_locations_output )
                          +
                          +

                          Lookup the cache locations for each linear cache indices in linear_cache_indices and return lxu_cache_locations

                          +

                          lxu_cache_locations A 1D tensor with the same length as linear_cache_indices. It contains the cache locations (the row indices in the cache) of the corresponding indices in linear_cache_indices, i.e., lxu_cache_locations[i] is the cache location for linear_cache_indices[i], where 0 <= i < linear_cache_indices.numel().

                          +
                          Parameters
                          + + + + + + + + +
                          linear_cache_indicesLinear cache indices tensor (1D)
                          lxu_cache_stateLXU cache state tensor (2D tensor of shape (# of cache sets, # of cache slots per set)). It contains linear indices of rows that are in the corresponding cache slots. If the cache slot is empty, a sentinel value is stored.
                          invalid_indexA sentinel value for linear cache indices. A cache index is skipped if it is a sentinel value.
                          gather_cache_statsA flag to enable/disable cache stats collection.
                          uvm_cache_statsA tensor for storing cache stats.
                          num_uniq_cache_indicesAn optional GPU tensor that contains the number of unique cache indices. If this tensor is passed, the kernel will only lookup num_uniq_cache_indices number of indices instead of looking up the entire linear_cache_indices.
                          lxu_cache_locations_outputAn optional output tensor. If the tensor is passed, the operator will not allocate a new output tensor and use this tensor as an output tensor.
                          +
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/memory__utils_2common_8cuh.html b/memory__utils_2common_8cuh.html new file mode 100644 index 000000000..4484a1a59 --- /dev/null +++ b/memory__utils_2common_8cuh.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/common.cuh File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          common.cuh File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +#include <ATen/cuda/Exceptions.h>
                          +#include <c10/cuda/CUDAGuard.h>
                          +#include <sys/mman.h>
                          +#include <unistd.h>
                          +#include <cstring>
                          +#include "common.h"
                          +#include "fbgemm_gpu/cumem_utils.h"
                          +#include "fbgemm_gpu/enum_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +
                          + + + + diff --git a/memory__utils_2common_8h.html b/memory__utils_2common_8h.html new file mode 100644 index 000000000..63335c801 --- /dev/null +++ b/memory__utils_2common_8h.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/common.h File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          common.h File Reference
                          +
                          +
                          +
                          #include <ATen/ATen.h>
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/memory__utils_8cpp.html b/memory__utils_8cpp.html new file mode 100644 index 000000000..40a678324 --- /dev/null +++ b/memory__utils_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/memory_utils.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          memory_utils.cpp File Reference
                          +
                          +
                          +
                          #include "common.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/memory__utils_8cu.html b/memory__utils_8cu.html new file mode 100644 index 000000000..919723d21 --- /dev/null +++ b/memory__utils_8cu.html @@ -0,0 +1,169 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/memory_utils.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          memory_utils.cu File Reference
                          +
                          +
                          +
                          #include "common.cuh"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                          +Functions

                          Tensor new_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                           
                          Tensor new_managed_tensor_meta (const Tensor &self, const std::vector< std::int64_t > &sizes)
                           
                          Tensor new_vanilla_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                           
                          Tensor new_host_mapped_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                           
                          Tensor new_unified_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes, bool is_host_mapped)
                           
                          bool uvm_storage (const Tensor &self)
                           
                          bool is_uvm_tensor (const Tensor &self)
                           
                          Tensor uvm_to_cpu (const Tensor &self)
                           
                          Tensor uvm_to_device (const Tensor &self, const Tensor &prototype)
                           
                          void uvm_cuda_mem_advise (const Tensor &self, int64_t cuda_memory_advise)
                           
                          void uvm_cuda_mem_prefetch_async (const Tensor &self, c10::optional< Tensor > device_t)
                           
                          void uvm_mem_advice_dont_fork (const Tensor &self)
                           
                          Tensor uvm_to_cpu_clone (const Tensor &self)
                           
                          +

                          Variable Documentation

                          + +

                          ◆ cuda_device_

                          + +
                          +
                          + + + + +
                          int cuda_device_
                          +
                          + +
                          +
                          + +

                          ◆ ptr_

                          + +
                          +
                          + + + + +
                          void* ptr_
                          +
                          + +
                          +
                          + +

                          ◆ storage_

                          + +
                          +
                          + + + + +
                          Storage storage_
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/memory__utils__ops_8cpp.html b/memory__utils__ops_8cpp.html new file mode 100644 index 000000000..ad57413cd --- /dev/null +++ b/memory__utils__ops_8cpp.html @@ -0,0 +1,113 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/memory_utils_ops.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          memory_utils_ops.cpp File Reference
                          +
                          +
                          +
                          #include <torch/library.h>
                          +#include "common.cuh"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/memory__utils__ops_8cu.html b/memory__utils__ops_8cu.html new file mode 100644 index 000000000..068f0c0e0 --- /dev/null +++ b/memory__utils__ops_8cu.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/memory_utils_ops.cu File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          memory_utils_ops.cu File Reference
                          +
                          +
                          +
                          #include <torch/library.h>
                          +#include "common.cuh"
                          +#include "fbgemm_gpu/ops_utils.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +
                          + + + + diff --git a/memory__utils__ops__cpu_8cpp.html b/memory__utils__ops__cpu_8cpp.html new file mode 100644 index 000000000..f3f7b476b --- /dev/null +++ b/memory__utils__ops__cpu_8cpp.html @@ -0,0 +1,113 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/memory_utils/memory_utils_ops_cpu.cpp File Reference + + + + + + + + + + + +
                          +
                          + + + + + + +
                          +
                          fbgemm_gpu +
                          +
                          +
                          + + + + + + + + +
                          +
                          + + +
                          +
                          +
                          +
                          +
                          +
                          Loading...
                          +
                          Searching...
                          +
                          No Matches
                          +
                          +
                          +
                          +
                          + + +
                          +
                          + +
                          memory_utils_ops_cpu.cpp File Reference
                          +
                          +
                          +
                          #include <torch/library.h>
                          +#include "common.h"
                          +#include "fbgemm_gpu/sparse_ops_utils.h"
                          +
                          + + + +

                          +Namespaces

                          namespace  fbgemm_gpu
                           
                          +

                          Typedef Documentation

                          + +

                          ◆ Tensor

                          + +
                          +
                          + + + + +
                          using Tensor = at::Tensor
                          +
                          + +
                          +
                          +
                          + + + + diff --git a/menu.js b/menu.js index b0b26936a..717761d01 100644 --- a/menu.js +++ b/menu.js @@ -24,13 +24,12 @@ */ function initMenu(relPath,searchEnabled,serverSide,searchPage,search) { function makeTree(data,relPath) { - var result=''; + let result=''; if ('children' in data) { result+='
                            '; - for (var i in data.children) { - var url; - var link; - link = data.children[i].url; + for (let i in data.children) { + let url; + const link = data.children[i].url; if (link.substring(0,1)=='^') { url = link.substring(1); } else { @@ -44,7 +43,7 @@ function initMenu(relPath,searchEnabled,serverSide,searchPage,search) { } return result; } - var searchBoxHtml; + let searchBoxHtml; if (searchEnabled) { if (serverSide) { searchBoxHtml='
                            '+ @@ -88,29 +87,28 @@ function initMenu(relPath,searchEnabled,serverSide,searchPage,search) { if (searchBoxHtml) { $('#main-menu').append('
                          • '); } - var $mainMenuState = $('#main-menu-state'); - var prevWidth = 0; + const $mainMenuState = $('#main-menu-state'); + let prevWidth = 0; if ($mainMenuState.length) { - function initResizableIfExists() { + const initResizableIfExists = function() { if (typeof initResizable==='function') initResizable(); } // animate mobile menu - $mainMenuState.change(function(e) { - var $menu = $('#main-menu'); - var options = { duration: 250, step: initResizableIfExists }; + $mainMenuState.change(function() { + const $menu = $('#main-menu'); + let options = { duration: 250, step: initResizableIfExists }; if (this.checked) { - options['complete'] = function() { $menu.css('display', 'block') }; + options['complete'] = () => $menu.css('display', 'block'); $menu.hide().slideDown(options); } else { - options['complete'] = function() { $menu.css('display', 'none') }; + options['complete'] = () => $menu.css('display', 'none'); $menu.show().slideUp(options); } }); // set default menu visibility - function resetState() { - var $menu = $('#main-menu'); - var $mainMenuState = $('#main-menu-state'); - var newWidth = $(window).outerWidth(); + const resetState = function() { + const $menu = $('#main-menu'); + const newWidth = $(window).outerWidth(); if (newWidth!=prevWidth) { if ($(window).outerWidth()<768) { $mainMenuState.prop('checked',false); $menu.hide(); diff --git a/menudata.js b/menudata.js index cf57d9b01..de1883c28 100644 --- a/menudata.js +++ b/menudata.js @@ -27,4 +27,164 @@ var menudata={children:[ {text:"Topics",url:"topics.html"}, {text:"Classes",url:"annotated.html",children:[ {text:"Class List",url:"annotated.html"}, -{text:"Class Index",url:"classes.html"}]}]} +{text:"Class Index",url:"classes.html"}, +{text:"Class Hierarchy",url:"hierarchy.html"}, +{text:"Class Members",url:"functions.html",children:[ +{text:"All",url:"functions.html",children:[ +{text:"a",url:"functions.html#index_a"}, +{text:"b",url:"functions_b.html#index_b"}, 
+{text:"c",url:"functions_c.html#index_c"}, +{text:"d",url:"functions_d.html#index_d"}, +{text:"e",url:"functions_e.html#index_e"}, +{text:"f",url:"functions_f.html#index_f"}, +{text:"g",url:"functions_g.html#index_g"}, +{text:"h",url:"functions_h.html#index_h"}, +{text:"i",url:"functions_i.html#index_i"}, +{text:"l",url:"functions_l.html#index_l"}, +{text:"m",url:"functions_m.html#index_m"}, +{text:"n",url:"functions_n.html#index_n"}, +{text:"o",url:"functions_o.html#index_o"}, +{text:"p",url:"functions_p.html#index_p"}, +{text:"r",url:"functions_r.html#index_r"}, +{text:"s",url:"functions_s.html#index_s"}, +{text:"t",url:"functions_t.html#index_t"}, +{text:"v",url:"functions_v.html#index_v"}, +{text:"w",url:"functions_w.html#index_w"}, +{text:"x",url:"functions_x.html#index_x"}, +{text:"~",url:"functions_~.html#index__7E"}]}, +{text:"Functions",url:"functions_func.html",children:[ +{text:"a",url:"functions_func.html#index_a"}, +{text:"b",url:"functions_func_b.html#index_b"}, +{text:"c",url:"functions_func_c.html#index_c"}, +{text:"d",url:"functions_func_d.html#index_d"}, +{text:"e",url:"functions_func_e.html#index_e"}, +{text:"f",url:"functions_func_f.html#index_f"}, +{text:"g",url:"functions_func_g.html#index_g"}, +{text:"i",url:"functions_func_i.html#index_i"}, +{text:"l",url:"functions_func_l.html#index_l"}, +{text:"m",url:"functions_func_m.html#index_m"}, +{text:"o",url:"functions_func_o.html#index_o"}, +{text:"r",url:"functions_func_r.html#index_r"}, +{text:"s",url:"functions_func_s.html#index_s"}, +{text:"t",url:"functions_func_t.html#index_t"}, +{text:"v",url:"functions_func_v.html#index_v"}, +{text:"w",url:"functions_func_w.html#index_w"}, +{text:"~",url:"functions_func_~.html#index__7E"}]}, +{text:"Variables",url:"functions_vars.html",children:[ +{text:"a",url:"functions_vars.html#index_a"}, +{text:"b",url:"functions_vars.html#index_b"}, +{text:"c",url:"functions_vars.html#index_c"}, +{text:"d",url:"functions_vars.html#index_d"}, 
+{text:"f",url:"functions_vars.html#index_f"}, +{text:"g",url:"functions_vars.html#index_g"}, +{text:"h",url:"functions_vars.html#index_h"}, +{text:"i",url:"functions_vars.html#index_i"}, +{text:"l",url:"functions_vars.html#index_l"}, +{text:"n",url:"functions_vars.html#index_n"}, +{text:"p",url:"functions_vars.html#index_p"}, +{text:"r",url:"functions_vars.html#index_r"}, +{text:"s",url:"functions_vars.html#index_s"}, +{text:"v",url:"functions_vars.html#index_v"}, +{text:"w",url:"functions_vars.html#index_w"}, +{text:"x",url:"functions_vars.html#index_x"}]}, +{text:"Typedefs",url:"functions_type.html"}, +{text:"Enumerator",url:"functions_eval.html"}]}]}, +{text:"Files",url:"files.html",children:[ +{text:"File List",url:"files.html"}, +{text:"File Members",url:"globals.html",children:[ +{text:"All",url:"globals.html",children:[ +{text:"_",url:"globals.html#index__5F"}, +{text:"a",url:"globals_a.html#index_a"}, +{text:"b",url:"globals_b.html#index_b"}, +{text:"c",url:"globals_c.html#index_c"}, +{text:"d",url:"globals_d.html#index_d"}, +{text:"e",url:"globals_e.html#index_e"}, +{text:"f",url:"globals_f.html#index_f"}, +{text:"g",url:"globals_g.html#index_g"}, +{text:"h",url:"globals_h.html#index_h"}, +{text:"i",url:"globals_i.html#index_i"}, +{text:"j",url:"globals_j.html#index_j"}, +{text:"k",url:"globals_k.html#index_k"}, +{text:"l",url:"globals_l.html#index_l"}, +{text:"m",url:"globals_m.html#index_m"}, +{text:"n",url:"globals_n.html#index_n"}, +{text:"o",url:"globals_o.html#index_o"}, +{text:"p",url:"globals_p.html#index_p"}, +{text:"q",url:"globals_q.html#index_q"}, +{text:"r",url:"globals_r.html#index_r"}, +{text:"s",url:"globals_s.html#index_s"}, +{text:"t",url:"globals_t.html#index_t"}, +{text:"u",url:"globals_u.html#index_u"}, +{text:"v",url:"globals_v.html#index_v"}, +{text:"w",url:"globals_w.html#index_w"}, +{text:"x",url:"globals_x.html#index_x"}, +{text:"y",url:"globals_y.html#index_y"}]}, +{text:"Functions",url:"globals_func.html",children:[ 
+{text:"_",url:"globals_func.html#index__5F"}, +{text:"a",url:"globals_func_a.html#index_a"}, +{text:"b",url:"globals_func_b.html#index_b"}, +{text:"c",url:"globals_func_c.html#index_c"}, +{text:"d",url:"globals_func_d.html#index_d"}, +{text:"e",url:"globals_func_e.html#index_e"}, +{text:"f",url:"globals_func_f.html#index_f"}, +{text:"g",url:"globals_func_g.html#index_g"}, +{text:"h",url:"globals_func_h.html#index_h"}, +{text:"i",url:"globals_func_i.html#index_i"}, +{text:"k",url:"globals_func_k.html#index_k"}, +{text:"l",url:"globals_func_l.html#index_l"}, +{text:"m",url:"globals_func_m.html#index_m"}, +{text:"p",url:"globals_func_p.html#index_p"}, +{text:"r",url:"globals_func_r.html#index_r"}, +{text:"s",url:"globals_func_s.html#index_s"}, +{text:"t",url:"globals_func_t.html#index_t"}, +{text:"w",url:"globals_func_w.html#index_w"}]}, +{text:"Variables",url:"globals_vars.html",children:[ +{text:"b",url:"globals_vars.html#index_b"}, +{text:"c",url:"globals_vars_c.html#index_c"}, +{text:"d",url:"globals_vars_d.html#index_d"}, +{text:"e",url:"globals_vars_e.html#index_e"}, +{text:"f",url:"globals_vars_f.html#index_f"}, +{text:"g",url:"globals_vars_g.html#index_g"}, +{text:"h",url:"globals_vars_h.html#index_h"}, +{text:"i",url:"globals_vars_i.html#index_i"}, +{text:"k",url:"globals_vars_k.html#index_k"}, +{text:"l",url:"globals_vars_l.html#index_l"}, +{text:"m",url:"globals_vars_m.html#index_m"}, +{text:"n",url:"globals_vars_n.html#index_n"}, +{text:"o",url:"globals_vars_o.html#index_o"}, +{text:"p",url:"globals_vars_p.html#index_p"}, +{text:"r",url:"globals_vars_r.html#index_r"}, +{text:"s",url:"globals_vars_s.html#index_s"}, +{text:"t",url:"globals_vars_t.html#index_t"}, +{text:"u",url:"globals_vars_u.html#index_u"}, +{text:"v",url:"globals_vars_v.html#index_v"}, +{text:"w",url:"globals_vars_w.html#index_w"}]}, +{text:"Typedefs",url:"globals_type.html",children:[ +{text:"a",url:"globals_type.html#index_a"}, +{text:"c",url:"globals_type_c.html#index_c"}, 
+{text:"l",url:"globals_type_l.html#index_l"}, +{text:"n",url:"globals_type_n.html#index_n"}, +{text:"o",url:"globals_type_o.html#index_o"}, +{text:"t",url:"globals_type_t.html#index_t"}, +{text:"v",url:"globals_type_v.html#index_v"}]}, +{text:"Enumerations",url:"globals_enum.html"}, +{text:"Enumerator",url:"globals_eval.html"}, +{text:"Macros",url:"globals_defs.html",children:[ +{text:"_",url:"globals_defs.html#index__5F"}, +{text:"a",url:"globals_defs_a.html#index_a"}, +{text:"c",url:"globals_defs_c.html#index_c"}, +{text:"d",url:"globals_defs_d.html#index_d"}, +{text:"f",url:"globals_defs_f.html#index_f"}, +{text:"h",url:"globals_defs_h.html#index_h"}, +{text:"i",url:"globals_defs_i.html#index_i"}, +{text:"j",url:"globals_defs_j.html#index_j"}, +{text:"l",url:"globals_defs_l.html#index_l"}, +{text:"m",url:"globals_defs_m.html#index_m"}, +{text:"n",url:"globals_defs_n.html#index_n"}, +{text:"p",url:"globals_defs_p.html#index_p"}, +{text:"q",url:"globals_defs_q.html#index_q"}, +{text:"s",url:"globals_defs_s.html#index_s"}, +{text:"t",url:"globals_defs_t.html#index_t"}, +{text:"w",url:"globals_defs_w.html#index_w"}, +{text:"x",url:"globals_defs_x.html#index_x"}, +{text:"y",url:"globals_defs_y.html#index_y"}]}]}]}]} diff --git a/merge__pooled__embedding__ops__cpu_8cpp.html b/merge__pooled__embedding__ops__cpu_8cpp.html new file mode 100644 index 000000000..4f3e00ce4 --- /dev/null +++ b/merge__pooled__embedding__ops__cpu_8cpp.html @@ -0,0 +1,164 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_cpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            merge_pooled_embedding_ops_cpu.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <c10/core/TensorOptions.h>
                            +#include <torch/library.h>
                            +#include "fbgemm_gpu/dispatch_macros.h"
                            +#include "fbgemm_gpu/ops_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ FBGEMM_OP_DISPATCH()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (CPU ,
                            "merge_pooled_embeddings" ,
                            fbgemm_gpu::merge_pooled_embeddings_cpu  )
                            +
                            + +
                            +
                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/merge__pooled__embedding__ops__gpu_8cpp.html b/merge__pooled__embedding__ops__gpu_8cpp.html new file mode 100644 index 000000000..34ef09d80 --- /dev/null +++ b/merge__pooled__embedding__ops__gpu_8cpp.html @@ -0,0 +1,147 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            merge_pooled_embedding_ops_gpu.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <ATen/cuda/CUDAContext.h>
                            +#include <ATen/cuda/CUDAEvent.h>
                            +#include <ATen/cuda/PeerToPeerAccess.h>
                            +#include <c10/core/Device.h>
                            +#include <c10/core/TensorOptions.h>
                            +#include <c10/cuda/CUDAGuard.h>
                            +#include <c10/util/irange.h>
                            +#include <torch/library.h>
                            +#include <algorithm>
                            +#include <tuple>
                            +#include "fbgemm_gpu/merge_pooled_embeddings.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +#include "fbgemm_gpu/topology_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/merge__pooled__embeddings_8h.html b/merge__pooled__embeddings_8h.html new file mode 100644 index 000000000..b1d9b0a7c --- /dev/null +++ b/merge__pooled__embeddings_8h.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/merge_pooled_embeddings.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            merge_pooled_embeddings.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/metric__ops_8cu.html b/metric__ops_8cu.html new file mode 100644 index 000000000..74ceb8bd7 --- /dev/null +++ b/metric__ops_8cu.html @@ -0,0 +1,192 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/metric_ops/metric_ops.cu File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            metric_ops.cu File Reference
                            +
                            +
                            +
                            #include <ATen/ceil_div.h>
                            +#include <ATen/cuda/CUDAContext.h>
                            +#include <c10/cuda/CUDAGuard.h>
                            +#include <math.h>
                            +#include <ATen/cuda/Atomic.cuh>
                            +#include <algorithm>
                            +#include "fbgemm_gpu/dispatch_macros.h"
                            +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                            +#include "metric_ops.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Macro Definition Documentation

                            + +

                            ◆ LAUNCH_AUC_KERNEL

                            + +
                            +
                            + + + + + + + +
                            #define LAUNCH_AUC_KERNEL( pad)
                            +
                            +Value:
                            typedef cub::BlockScan<acc_t, NUM_THREADS_PER_BLOCK> BlockScan; \
                            +
                            TORCH_CHECK( \
                            +
                            sizeof(BlockScan::TempStorage) + \
                            +
                            ((MAX_ENTRIES_PER_BLOCK * 2 + 3) * sizeof(acc_t)) <= \
                            +
                            max_smem_size) \
                            +
                            auc_kernel<index_t, label_t, scalar_t, acc_t, pad> \
                            +
                            <<<dim3(grid_size), \
                            + +
                            0, \
                            +
                            at::cuda::getCurrentCUDAStream()>>>( \
                            +
                            output.data_ptr<acc_t>(), \
                            +
                            indices.data_ptr<index_t>(), \
                            +
                            labels.data_ptr<label_t>(), \
                            +
                            weights.data_ptr<scalar_t>(), \
                            +
                            num_blocks > 1 ? block_flags.data_ptr<int>() : nullptr, \
                            +
                            num_blocks > 1 ? block_sums.data_ptr<acc_t>() : nullptr, \
                            +
                            num_entries, \
                            +
                            last_block_num_entries, \
                            +
                            padded_num_entries_per_block, \
                            +
                            num_blocks); \
                            +
                            C10_CUDA_KERNEL_LAUNCH_CHECK();
                            +
                            template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                            Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                            +
                            template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices
                            Definition gen_batch_index_select_dim0_forward_kernel_small.cu:123
                            +
                            constexpr int MAX_ENTRIES_PER_BLOCK
                            Definition metric_ops.cu:20
                            +
                            constexpr int NUM_THREADS_PER_BLOCK
                            Definition metric_ops.cu:21
                            +
                            +
                            +
                            +

                            Variable Documentation

                            + +

                            ◆ MAX_ENTRIES_PER_BLOCK

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int MAX_ENTRIES_PER_BLOCK = 512
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ NUM_THREADS_PER_BLOCK

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int NUM_THREADS_PER_BLOCK = 256
                            +
                            +constexpr
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/metric__ops_8h.html b/metric__ops_8h.html new file mode 100644 index 000000000..3dae855da --- /dev/null +++ b/metric__ops_8h.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/metric_ops/metric_ops.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            metric_ops.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/metric__ops__host_8cpp.html b/metric__ops__host_8cpp.html new file mode 100644 index 000000000..15b6b87e1 --- /dev/null +++ b/metric__ops__host_8cpp.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/metric_ops/metric_ops_host.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            metric_ops_host.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/core/op_registration/op_registration.h>
                            +#include <torch/library.h>
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +#include "metric_ops.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/namespacefbgemm__gpu.html b/namespacefbgemm__gpu.html new file mode 100644 index 000000000..f9fd26442 --- /dev/null +++ b/namespacefbgemm__gpu.html @@ -0,0 +1,13947 @@ + + + + + + + +fbgemm_gpu: fbgemm_gpu Namespace Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            + +
                            fbgemm_gpu Namespace Reference
                            +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                            +Classes

                            struct  BitonicSort
                             
                            struct  Comparator
                             
                            struct  DefaultPtrTraits
                             
                            class  enum_registration
                             
                            class  FixedDivisor
                             
                            class  GenericPackedTensorAccessor
                             
                            class  GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >
                             
                            class  GenericPackedTensorAccessorBase
                             
                            struct  Half4
                             
                            class  PermutePooledEmbsFunction
                             
                            class  PermutePooledEmbsFunctionSplit
                             
                            struct  rk_state
                             
                            struct  SharedMemory
                             
                            struct  SharedMemory< double >
                             
                            struct  SharedMemory< float >
                             
                            struct  SharedMemory< int32_t >
                             
                            struct  SharedMemory< int64_t >
                             
                            struct  SharedMemory< Vec4T< at::acc_type< double, true > > >
                             
                            struct  SharedMemory< Vec4T< at::acc_type< float, true > > >
                             
                            struct  StochasticRoundingRNGState
                             
                            class  TensorAccessor
                             
                            class  TensorAccessor< T, 1, PtrTraits, index_t >
                             
                            class  TensorAccessorBase
                             
                            struct  Vec4AccT
                             
                            struct  Vec4StepT
                             
                            struct  Vec4StepT< STEP, at::Half >
                             
                            struct  Vec4StepT< STEP, float >
                             
                            struct  Vec4StepT< STEP, uint8_t >
                             
                            struct  Vec4T
                             
                            struct  Vec4T< at::BFloat16 >
                             
                            struct  Vec4T< at::Half >
                             
                            struct  Vec4T< double >
                             
                            struct  Vec4T< float >
                             
                            struct  VecNT
                             
                            struct  VecNT< 1, PrimitiveType::FP >
                             
                            struct  VecNT< 16, PrimitiveType::INT >
                             
                            struct  VecNT< 2, PrimitiveType::FP >
                             
                            struct  VecNT< 4, PrimitiveType::FP >
                             
                            struct  VecNT< 4, PrimitiveType::INT >
                             
                            struct  VecNT< 8, PrimitiveType::INT >
                             
                            struct  WeightRow
                             
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                            +Functions

                            template<typename IndexType >
                            void report_embedding_error (int t, int B, int b_begin, int b_end, const IndexType *offsets_data, const IndexType *indices_data, int64_t hash_size, bool allow_minus_one=false)
                             
                            Tensor new_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                             
                            Tensor new_managed_tensor_meta (const Tensor &self, const std::vector< std::int64_t > &sizes)
                             
                            Tensor new_host_mapped_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                             
                            Tensor new_unified_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes, bool is_host_mapped)
                             
                            Tensor new_vanilla_managed_tensor (const Tensor &self, const std::vector< std::int64_t > &sizes)
                             
                            bool uvm_storage (const Tensor &self)
                             
                            bool is_uvm_tensor (const Tensor &self)
                             
                            Tensor uvm_to_cpu (const Tensor &self)
                             
                            Tensor uvm_to_device (const Tensor &self, const Tensor &prototype)
                             
                            void uvm_cuda_mem_advise (const Tensor &self, int64_t cuda_memory_advise)
                             
                            void uvm_cuda_mem_prefetch_async (const Tensor &self, c10::optional< Tensor > device_t)
                             
                            void uvm_mem_advice_dont_fork (const Tensor &self)
                             
                            Tensor uvm_to_cpu_clone (const Tensor &self)
                             
                            void embedding_inplace_update_cuda (Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor update_weights, Tensor update_table_idx, Tensor update_row_idx, Tensor update_offsets, const int64_t row_alignment, c10::optional< Tensor > lxu_cache_weights=c10::nullopt, c10::optional< Tensor > lxu_cache_locations=c10::nullopt)
                             
                            Tensor pruned_array_lookup_from_row_idx_cuda (const Tensor &update_row_indices, const Tensor &update_table_indices, const Tensor &index_remappings, const Tensor &index_remappings_offsets)
                             
                            template<typename scalar_t , int ITEMS_PER_THREAD, int NUM_THREADS_PER_BLOCK>
                            __inline__ __device__ void inclusive_sum_scan_kernel (scalar_t(&arr)[ITEMS_PER_THREAD], typename cub::BlockScan< scalar_t, NUM_THREADS_PER_BLOCK >::TempStorage &temp_storage, int *block_flags, volatile scalar_t *block_sums, scalar_t *block_prev, const int num_entries_per_block, const int block_id, const bool is_multi_block, const int signal)
                             
                            at::Tensor expand_into_jagged_permute_cuda (const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                             
                            std::tuple< at::Tensor, at::Tensor > histogram_binning_calibration_cpu (const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound=0.0, double upper_bound=1.0, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                             
                            std::tuple< at::Tensor, at::Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                             
                            std::tuple< Tensor, Tensor, Tensorpadding_fused_tbe_input_combine_with_length_cpu (const std::vector< Tensor > &indices_list, const std::vector< Tensor > &lengths_list, const std::vector< Tensor > &per_sample_weights, int64_t batch_size)
                             
                            template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                            __global__ __launch_bounds__ (kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta
                             
                            Tensor jagged_index_add_2d_forward_cuda (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_input_rows, const int64_t num_output_rows)
                             
                            Tensor jagged_index_select_2d_forward_cuda (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_output_rows)
                             
                            Tensor jagged_dense_elementwise_add (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                             
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                             
                            std::vector< Tensorjagged_index_select_2d (const Tensor &values, const Tensor &lengths, const Tensor &indices)
                             
                            Tensor jagged_index_select_2d_forward_cpu (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_output_rows)
                             
                            Tensor jagged_index_add_2d_forward_cpu (const Tensor &values, const Tensor &indices, const Tensor &input_offsets, const Tensor &output_offsets, const int64_t num_dense_input_rows, const int64_t num_output_rows)
                             
                            Tensor jagged_slice_forward_cpu (const Tensor &x_values, const Tensor &x_lengths, const Tensor &src_start, const Tensor &output_lengths, const Tensor &tgt_start, const int64_t num_output_rows, const int64_t slice_length, const bool fill_zeros)
                             
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_cuda (const Tensor &x_values, const std::vector< Tensor > &x_offsets, const Tensor &y)
                             
                            DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
                             
                            DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
                             
                            DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
                             
                            DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
                             
                            DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
                             
                            DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
                             
                            Tensor batched_unary_embeddings_forward_cpu (const Tensor &weight, const Tensor &table_offsets, const Tensor &offsets, const Tensor &indices)
                             
                            Tensor pack_segments_forward_cpu (const Tensor &t_in, const Tensor &lengths, const int64_t max_length)
                             
                            Tensor pack_segments_backward_cpu (const Tensor &data, const Tensor &lengths, const int64_t total_length, const int64_t max_length)
                             
                            DLL_PUBLIC Tensor pack_segments_backward_cuda (const Tensor &data, const Tensor &lengths, int64_t total_length, int64_t max_length)
                             
                            DLL_PUBLIC Tensor pack_segments_forward_cuda (const Tensor &t_in, const Tensor &lengths, const int64_t max_length)
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ enum_item

                            + +
                            +
                            + + + + +
                            using enum_item = std::tuple<std::string, int64_t>
                            +
                            + +
                            +
                            + +

                            ◆ enum_items

                            + +
                            +
                            + + + + +
                            using enum_items = std::vector<enum_item>
                            +
                            + +
                            +
                            + +

                            ◆ enum_result

                            + +
                            +
                            + + + + +
                            using enum_result
                            +
                            +Initial value:
                            std::vector<
                            +
                            std::tuple<std::string, std::vector<std::tuple<std::string, int64_t>>>>
                            +
                            +
                            +
                            + +

                            ◆ fint32

                            + +
                            +
                            + + + + +
                            using fint32
                            +
                            +Initial value:
                            union fint32 {
                            + +
                            float F;
                            +
                            }
                            +
                            template __global__ uint32_t
                            Definition gen_embedding_backward_split_grad.cu:137
                            +
                            union fint32 { uint32_t I; float F;} fint32
                            Definition quantize_ops_utils.h:24
                            +
                            +
                            +
                            + +

                            ◆ PackedTensorAccessor32

                            + +
                            +
                            +
                            +template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits>
                            + + + + +
                            using PackedTensorAccessor32
                            +
                            +Initial value:
                            +
                            GenericPackedTensorAccessor<T, N, PtrTraits, int32_t>
                            +
                            +
                            +
                            + +

                            ◆ PackedTensorAccessor64

                            + +
                            +
                            +
                            +template<typename T , size_t N, template< typename U > class PtrTraits = DefaultPtrTraits>
                            + + + + +
                            using PackedTensorAccessor64
                            +
                            +Initial value:
                            +
                            GenericPackedTensorAccessor<T, N, PtrTraits, int64_t>
                            +
                            +
                            +
                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            typedef at::Tensor Tensor = at::Tensor
                            +
                            + +
                            +
                            + +

                            ◆ uoffset_t

                            + +
                            +
                            + + + + +
                            using uoffset_t = std::make_unsigned_t<offset_t>
                            +
                            + +
                            +
                            +

                            Enumeration Type Documentation

                            + +

                            ◆ args_pos

                            + +
                            +
                            + + + + +
                            enum args_pos
                            +
                            + + + + + + + + +
                            Enumerator
                            P_indices_prts 
                            P_lengths_addrs 
                            P_indices_offsets 
                            P_lengths_offsets 
                            P_per_sample_weight 
                            P_indices_is_long 
                            P_lengths_is_long 
                            + +
                            +
                            + +

                            ◆ BoundsCheckMode

                            + +
                            +
                            + + + + + +
                            + + + + +
                            enum class BoundsCheckMode : uint8_t
                            +
                            +strong
                            +
                            + + + + +
                            Enumerator
                            FATAL 
                            WARNING 
                            IGNORE 
                            + +
                            +
                            + +

                            ◆ PlacementType

                            + +
                            +
                            + + + + + +
                            + + + + +
                            enum class PlacementType : uint8_t
                            +
                            +strong
                            +
                            + + + + + +
                            Enumerator
                            DEVICE 
                            MANAGED 
                            MANAGED_CACHING 
                            HOST 
                            + +
                            +
                            + +

                            ◆ PoolingMode

                            + +
                            +
                            + + + + + +
                            + + + + +
                            enum class PoolingMode : uint8_t
                            +
                            +strong
                            +
                            + + + + +
                            Enumerator
                            SUM 
                            MEAN 
                            NONE 
                            + +
                            +
                            + +

                            ◆ PrimitiveType

                            + +
                            +
                            + + + + + +
                            + + + + +
                            enum class PrimitiveType : uint8_t
                            +
                            +strong
                            +
                            + + + + +
                            Enumerator
                            FP 
                            INT 
                            BF 
                            + +
                            +
                            + +

                            ◆ SparseType

                            + +
                            +
                            + + + + + +
                            + + + + +
                            enum class SparseType : uint8_t
                            +
                            +strong
                            +
                            + + + + + + + + + +
                            Enumerator
                            FP32 
                            FP16 
                            INT8 
                            INT4 
                            INT2 
                            BF16 
                            FP8 
                            INVALID 
                            + +
                            +
                            + +

                            ◆ uvm_cache_stats_index

                            + +
                            +
                            + + + + + + + +
                            Enumerator
                            num_calls 
                            num_requested_indices 
                            num_unique_indices 
                            num_unique_misses 
                            num_conflict_unique_misses 
                            num_conflict_misses 
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ __align__() [1/4]

                            + +
                            +
                            + + + + + + + +
                            struct __align__ (16 )
                            +
                            + +
                            +
                            + +

                            ◆ __align__() [2/4]

                            + +
                            +
                            + + + + + + + +
                            struct __align__ (32 )
                            +
                            + +
                            +
                            + +

                            ◆ __align__() [3/4]

                            + +
                            +
                            + + + + + + + +
                            struct __align__ (64 )
                            +
                            + +
                            +
                            + +

                            ◆ __align__() [4/4]

                            + +
                            +
                            + + + + + + + +
                            struct __align__ (8 )
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [1/7]

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + +
                            __launch_bounds__ (kMaxThreads )
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [2/7]

                            + +
                            +
                            +
                            +template<bool sequence, bool has_weight, bool bucketize_pos, typename offset_t , typename index_t , typename scalar_t >
                            + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads )
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [3/7]

                            + +
                            +
                            +
                            +template<const int BLOCK_TILE_M, const int BLOCK_TILE_N, const int BLOCK_TILE_K, const int THREAD_TILE_M, const int THREAD_TILE_N, typename index_t , typename scalar_t >
                            + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads ) const
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [4/7]

                            + +
                            +
                            + + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads )
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [5/7]

                            + +
                            +
                            +
                            +template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                            + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads ) const
                            +
                            +

                            output = f(x, y) where x and y are jagged (and share x_offsets), and output is dense.

                            +
                            Parameters
                            + + +
                            padding_valuepadding_value for the output, not for inputs
                            +
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [6/7]

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads ) const
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [7/7]

                            + +
                            +
                            + + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads ) const
                            +
                            + +
                            +
                            + +

                            ◆ _bfloat16_to_float_cpu()

                            + +
                            +
                            + + + + + + + +
                            at::Tensor _bfloat16_to_float_cpu (const at::Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _block_bucketize_sparse_features_cpu()

                            + +
                            +
                            +
                            +template<bool sequence, bool has_weight, typename offset_t , typename index_t , typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _block_bucketize_sparse_features_cpu (const Tensor & lengths,
                            const Tensor & indices,
                            const c10::optional< Tensor > & weights,
                            const bool bucketize_pos,
                            const Tensor & block_sizes,
                            const int64_t my_size,
                            Tensor new_lengths,
                            Tensor new_indices,
                            c10::optional< Tensor > new_weights,
                            c10::optional< Tensor > new_pos,
                            const c10::optional< Tensor > & unbucketize_permute,
                            const c10::optional< Tensor > & batch_size_per_feature,
                            const c10::optional< std::vector< at::Tensor > > & block_bucketize_pos )
                            +
                            + +
                            +
                            + +

                            ◆ _bucketize_sparse_features_cpu()

                            + +
                            +
                            +
                            +template<bool has_weight, typename index_t , typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _bucketize_sparse_features_cpu (const at::Tensor & lengths,
                            const at::Tensor & indices,
                            const c10::optional< at::Tensor > & weights,
                            const bool bucketize_pos,
                            const int64_t my_size,
                            at::Tensor & new_lengths,
                            at::Tensor & new_indices,
                            c10::optional< at::Tensor > new_weights,
                            c10::optional< at::Tensor > new_pos )
                            +
                            + +
                            +
                            + +

                            ◆ _cat_int_tensors()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor _cat_int_tensors (const std::vector< Tensor > & tensor_list,
                            int64_t total_num,
                            bool use_pin_memory )
                            +
                            + +
                            +
                            + +

                            ◆ _cat_int_tensors_with_padding()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor _cat_int_tensors_with_padding (const std::vector< Tensor > & tensor_list,
                            int64_t total_num,
                            bool use_pin_memory,
                            int64_t batch_size )
                            +
                            + +
                            +
                            + +

                            ◆ _cat_per_sample_weights_list()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor _cat_per_sample_weights_list (const std::vector< Tensor > & per_sample_weights,
                            const std::vector< Tensor > & indices_list,
                            int64_t total_num,
                            bool use_pin_memory )
                            +
                            + +
                            +
                            + +

                            ◆ _expand_into_jagged_permute_cpu_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _expand_into_jagged_permute_cpu_kernel (const offsets_t *const __restrict__ input_offsets,
                            const offsets_t *const __restrict__ output_offsets,
                            const int64_t permute_size,
                            const index_t *const __restrict__ permute,
                            index_t *const __restrict__ output_permute )
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_bfloat16_cpu()

                            + +
                            +
                            + + + + + + + +
                            at::Tensor _float_to_bfloat16_cpu (const at::Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_FP8rowwise_gpu_t()

                            + +
                            +
                            +
                            +template<typename input_t >
                            + + + + + + + + + + + +
                            Tensor _float_to_FP8rowwise_gpu_t (const Tensor & input,
                            const bool forward )
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_fused8bitrowwise_cpu_out_t()

                            + +
                            +
                            +
                            +template<typename input_t >
                            + + + + + + + + + + + +
                            Tensor & _float_to_fused8bitrowwise_cpu_out_t (Tensor & output,
                            const Tensor & input )
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_fused8bitrowwise_gpu_t()

                            + +
                            +
                            +
                            +template<typename input_t >
                            + + + + + + + +
                            Tensor _float_to_fused8bitrowwise_gpu_t (const Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_fusednbitrowwise_cpu()

                            + +
                            +
                            +
                            +template<typename input_t >
                            + + + + + + + + + + + +
                            Tensor _float_to_fusednbitrowwise_cpu (const Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_hfp8_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor _float_to_hfp8_cpu (const at::Tensor & input,
                            const int64_t ebits,
                            const int64_t exponent_bias,
                            const double max_pos )
                            +
                            + +
                            +
                            + +

                            ◆ _float_to_paddedFP8rowwise_gpu_t()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor _float_to_paddedFP8rowwise_gpu_t (const Tensor & input,
                            const bool forward,
                            const int64_t row_dim )
                            +
                            + +
                            +
                            + +

                            ◆ _FP8rowwise_to_float_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC at::Tensor _FP8rowwise_to_float_gpu (const at::Tensor & input,
                            bool forward,
                            const int64_t output_dtype )
                            +
                            + +
                            +
                            + +

                            ◆ _FP8rowwise_to_float_gpu_t()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor _FP8rowwise_to_float_gpu_t (const Tensor & input,
                            bool forward,
                            const int64_t output_dtype )
                            +
                            + +
                            +
                            + +

                            ◆ _fused8bitrowwise_to_float_cpu_out_t()

                            + +
                            +
                            +
                            +template<typename output_t >
                            + + + + + + + + + + + +
                            Tensor & _fused8bitrowwise_to_float_cpu_out_t (Tensor & output,
                            const Tensor & input )
                            +
                            + +
                            +
                            + +

                            ◆ _fused8bitrowwise_to_float_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC at::Tensor _fused8bitrowwise_to_float_gpu (const at::Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _fused8bitrowwise_to_float_gpu_t()

                            + +
                            +
                            +
                            +template<typename output_t >
                            + + + + + + + +
                            Tensor _fused8bitrowwise_to_float_gpu_t (const Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _fused8bitrowwise_to_half_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC at::Tensor _fused8bitrowwise_to_half_gpu (const at::Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _fusednbitrowwise_to_float_cpu()

                            + +
                            +
                            +
                            +template<typename output_t >
                            + + + + + + + + + + + +
                            Tensor _fusednbitrowwise_to_float_cpu (const Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ _fusednbitrowwise_to_float_gpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            DLL_PUBLIC at::Tensor _fusednbitrowwise_to_float_gpu (const at::Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ _generic_histogram_binning_calibration_by_feature_cpu_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _generic_histogram_binning_calibration_by_feature_cpu_kernel (const int64_t num_logits,
                            const int64_t num_bins,
                            const int64_t num_segments,
                            const double recalibrate_value,
                            const int64_t bin_ctr_in_use_after,
                            const double bin_ctr_weight_value,
                            const LogitType *const logit_data,
                            const SegmentValueType *const dense_segment_value_data,
                            const double *const bin_num_examples_data,
                            const double *const bin_num_positives_data,
                            const double *const bin_boundaries,
                            LogitType *const calibrated_prediction_data,
                            int64_t *const bin_ids_data )
                            +
                            + +
                            +
                            + +

                            ◆ _half_to_fused8bitrowwise_cpu_out()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor & _half_to_fused8bitrowwise_cpu_out (Tensor & output,
                            const Tensor & input )
                            +
                            + +
                            +
                            + +

                            ◆ _half_to_fused8bitrowwise_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC Tensor _half_to_fused8bitrowwise_gpu (const Tensor & input)
                            +
                            + +
                            +
                            + +

                            ◆ _hfp8_to_float_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            at::Tensor _hfp8_to_float_cpu (const at::Tensor & input,
                            const int64_t ebits,
                            const int64_t exponent_bias )
                            +
                            + +
                            +
                            + +

                            ◆ _histogram_binning_calibration_by_feature_cpu_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _histogram_binning_calibration_by_feature_cpu_kernel (const int64_t num_logits,
                            const int64_t num_bins,
                            const int64_t num_segments,
                            const double recalibrate_value,
                            const double step,
                            const int64_t bin_ctr_in_use_after,
                            const double bin_ctr_weight_value,
                            const LogitType *const logit_data,
                            const SegmentValueType *const dense_segment_value_data,
                            const double *const bin_num_examples_data,
                            const double *const bin_num_positives_data,
                            LogitType *const calibrated_prediction_data,
                            int64_t *const bin_ids_data )
                            +
                            + +
                            +
                            + +

                            ◆ _histogram_binning_calibration_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _histogram_binning_calibration_cpu_kernel (const int64_t num_logits,
                            const double recalibrate_value,
                            const double step,
                            const int64_t bin_ctr_in_use_after,
                            const double bin_ctr_weight_value,
                            const T *const logit_data,
                            const double *const bin_num_examples_data,
                            const double *const bin_num_positives_data,
                            T *const calibrated_prediction_data,
                            int64_t *const bin_ids_data )
                            +
                            + +
                            +
                            + +

                            ◆ _invert_permute_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + + + + + + + + + + +
                            void _invert_permute_cpu_kernel (const int64_t permute_size,
                            const index_t *const __restrict__ permute,
                            index_t *const __restrict__ inversed_permute )
                            +
                            + +
                            +
                            + +

                            ◆ _paddedFP8rowwise_to_float_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC at::Tensor _paddedFP8rowwise_to_float_gpu (const at::Tensor & input,
                            const bool forward,
                            const int64_t row_dim,
                            const int64_t output_last_dim,
                            const int64_t output_dtype )
                            +
                            + +
                            +
                            + +

                            ◆ _paddedFP8rowwise_to_float_gpu_t()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor _paddedFP8rowwise_to_float_gpu_t (const Tensor & input,
                            const bool forward,
                            const int64_t row_dim,
                            const int64_t output_last_dim,
                            const int64_t output_dtype )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_1D_indices_weights_kernel_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_1D_indices_weights_kernel_cpu (const offsets_t *const __restrict__ input_offsets,
                            const indices_t *const __restrict__ indices,
                            const weights_t *const __restrict__ weights,
                            const int64_t permuted_lengths_size,
                            const int32_t *const __restrict__ permute,
                            const offsets_t *const __restrict__ permuted_lengths,
                            const offsets_t *const __restrict__ output_offsets,
                            indices_t *const __restrict__ permuted_indices,
                            weights_t *const __restrict__ permuted_weights )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_1D_lengths_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_1D_lengths_cpu_kernel (const index_t *const __restrict__ lengths,
                            int64_t permuted_lengths_size,
                            const int32_t *const __restrict__ permute,
                            index_t *const __restrict__ permuted_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_2D_indices_weights_kernel_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_2D_indices_weights_kernel_cpu (const int32_t T,
                            const int32_t B,
                            const indices_t *const __restrict__ indices,
                            const weights_t *const __restrict__ weights,
                            const int32_t *const __restrict__ permute,
                            const offsets_t *const __restrict__ input_offsets,
                            const int64_t *const __restrict__ output_offsets_per_thread_cumsum,
                            indices_t *const __restrict__ permuted_indices,
                            weights_t *const __restrict__ permuted_weights,
                            const offsets_t *const __restrict__ permuted_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_2D_lengths_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_2D_lengths_cpu_kernel (const int32_t T,
                            const int32_t B,
                            const index_t *const __restrict__ lengths,
                            int64_t lengths_size,
                            const int32_t *const __restrict__ permute,
                            index_t *const __restrict__ permuted_lengths,
                            index_t *const __restrict__ input_offsets,
                            int64_t *const __restrict__ output_offsets_per_thread_cumsum )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_data_kernel_cpu()

                            + +
                            +
                            +
                            +template<bool has_weight, typename index_t , typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_data_kernel_cpu (const int32_t T,
                            const int32_t B,
                            const index_t *const __restrict__ indices,
                            const scalar_t *const __restrict__ weights,
                            const int32_t *const __restrict__ permute,
                            const index_t *const __restrict__ input_offsets,
                            const int64_t *const __restrict__ output_offsets_per_thread_cumsum,
                            index_t *const __restrict__ permuted_indices,
                            scalar_t *const __restrict__ permuted_weights,
                            const index_t *const __restrict__ permuted_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_embeddings_kernel_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_embeddings_kernel_cpu (const int32_t T,
                            const int32_t B,
                            const scalar_t *const __restrict__ embeddings,
                            const int32_t *const __restrict__ permute,
                            const index_t *const __restrict__ input_offsets,
                            const int64_t *const __restrict__ output_offsets_per_thread_cumsum,
                            scalar_t *const __restrict__ permuted_embeddings,
                            const index_t *const __restrict__ permuted_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ _permute_lengths_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _permute_lengths_cpu_kernel (const int32_t T,
                            const int32_t B,
                            const index_t *const __restrict__ lengths,
                            int64_t lengths_size,
                            const int32_t *const __restrict__ permute,
                            index_t *const __restrict__ permuted_lengths,
                            index_t *const __restrict__ input_offsets,
                            int64_t *const __restrict__ output_offsets_per_thread_cumsum )
                            +
                            + +
                            +
                            + +

                            ◆ _segment_sum_csr_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void _segment_sum_csr_cpu_kernel (const int num_segments,
                            const int batch_size,
                            const int *const csr_seg_data,
                            const scalar_t *const values_data,
                            scalar_t *const output_data )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_fp16()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ float2 accumulate_fp16 (float2 acc,
                            __half2 vals )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_fp32()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ float accumulate_fp32 (float acc,
                            float vals )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_packed_hfp8()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float4 accumulate_packed_hfp8 (float4 acc,
                            uint32_t packedVals,
                            int exp_bits,
                            int exp_bias )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_packed_int2()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float_16 accumulate_packed_int2 (float_16 acc,
                            uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_packed_int4()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float8 accumulate_packed_int4 (float8 acc,
                            uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_packed_int8()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float4 accumulate_packed_int8 (float4 acc,
                            uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_fp16()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float2 accumulate_weighted_fp16 (float2 acc,
                            __half2 vals,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_fp32()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float accumulate_weighted_fp32 (float acc,
                            float vals,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_packed_hfp8()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float4 accumulate_weighted_packed_hfp8 (float4 acc,
                            uint32_t packedVals,
                            int exp_bits,
                            int exp_bias,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_packed_int2()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float_16 accumulate_weighted_packed_int2 (float_16 acc,
                            uint32_t packedVals,
                            __half2 shift_scale,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_packed_int4()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float8 accumulate_weighted_packed_int4 (float8 acc,
                            uint32_t packedVals,
                            __half2 shift_scale,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ accumulate_weighted_packed_int8()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float4 accumulate_weighted_packed_int8 (float4 acc,
                            uint32_t packedVals,
                            __half2 shift_scale,
                            float weight )
                            +
                            + +
                            +
                            + +

                            ◆ assign()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + +
                            + + + + + + + + + + + + + + + + +
                            __device__ void assign (bool assign,
                            T & x,
                            T y )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_complete_cumsum_cpu()

                            + +
                            +
                            + + + + + + + +
                            Tensor asynchronous_complete_cumsum_cpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_complete_cumsum_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC Tensor asynchronous_complete_cumsum_gpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_complete_cumsum_meta()

                            + +
                            +
                            + + + + + + + +
                            Tensor asynchronous_complete_cumsum_meta (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_exclusive_cumsum_cpu()

                            + +
                            +
                            + + + + + + + +
                            Tensor asynchronous_exclusive_cumsum_cpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_exclusive_cumsum_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC Tensor asynchronous_exclusive_cumsum_gpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_exclusive_cumsum_meta()

                            + +
                            +
                            + + + + + + + +
                            Tensor asynchronous_exclusive_cumsum_meta (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_inclusive_cumsum_cpu()

                            + +
                            +
                            + + + + + + + +
                            Tensor asynchronous_inclusive_cumsum_cpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ asynchronous_inclusive_cumsum_gpu()

                            + +
                            +
                            + + + + + + + +
                            DLL_PUBLIC Tensor asynchronous_inclusive_cumsum_gpu (const Tensor & t_in)
                            +
                            + +
                            +
                            + +

                            ◆ auc_kernel()

                            + +
                            +
                            +
                            +template<typename index_t , typename label_t , typename weight_t , typename acc_t , int PADDED_SECTION_SIZE>
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void auc_kernel (acc_t * output,
                            const index_t * indices,
                            const label_t * labels,
                            const weight_t * weights,
                            int * block_flags,
                            acc_t * block_sums,
                            const int num_entries,
                            const int last_block_num_entries,
                            const int padded_num_entries_per_block,
                            const int num_blocks )
                            +
                            + +
                            +
                            + +

                            ◆ ballot_sync()

                            + +
                            +
                            + + + + + + + + + + + +
                            DEVICE_INLINE uint64_t ballot_sync (int predicate,
                            unsigned shfl_sync_mask = kFullWarpMask )
                            +
                            + +
                            +
                            + +

                            ◆ batch_auc()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor batch_auc (const int64_t num_tasks,
                            const at::Tensor & indices,
                            const at::Tensor & labels,
                            const at::Tensor & weights )
                            +
                            + +
                            +
                            + +

                            ◆ batched_dense_vec_jagged_2d_mul_backward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > batched_dense_vec_jagged_2d_mul_backward (const Tensor & grad_output,
                            const Tensor & v,
                            const Tensor & a_values,
                            const Tensor & a_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ batched_dense_vec_jagged_2d_mul_backward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > batched_dense_vec_jagged_2d_mul_backward_meta (const Tensor & grad_output,
                            const Tensor & v,
                            const Tensor & a_values,
                            const Tensor & a_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ batched_dense_vec_jagged_2d_mul_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor batched_dense_vec_jagged_2d_mul_forward (const Tensor & v,
                            const Tensor & a_values,
                            const Tensor & a_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ batched_dense_vec_jagged_2d_mul_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor batched_dense_vec_jagged_2d_mul_forward_meta (const Tensor & v,
                            const Tensor & a_values,
                            const Tensor & a_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ batched_unary_embeddings_backward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda (const Tensor & grad_output,
                            const Tensor & weight,
                            const Tensor & table_offsets,
                            const Tensor & offsets,
                            const Tensor & indices )
                            +
                            + +
                            +
                            + +

                            ◆ batched_unary_embeddings_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor batched_unary_embeddings_forward_cpu (const Tensor & weight,
                            const Tensor & table_offsets,
                            const Tensor & offsets,
                            const Tensor & indices )
                            +
                            +

                            CPU version of batched_unary_embeddings forward pass.

                            +

                            Sums up weight embeddings according to offsets and indices. table_offests is a helper struct to quickly navigate through tables in weight – it is caller's responsibility to keep it in sync with weight. Visualization of op semantics: https://fburl.com/9a4uktmb

                            +

                            This version is only for numerical verification so not optimized for performance.

                            +
                            Parameters
                            + + + + + +
                            weight- Weight for the embeddings.
                            table_offsets- Index offsets for each table entry in weight.
                            offsets- Offsets for the starting point of each summation.
                            indices- Indices for the embeddings to fetch (from weight).
                            +
                            +
                            +
                            Returns
                            The sumed embeddings.
                            + +
                            +
                            + +

                            ◆ batched_unary_embeddings_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor batched_unary_embeddings_forward_cuda (const Tensor & weight,
                            const Tensor & table_offsets,
                            const Tensor & offsets,
                            const Tensor & indices )
                            +
                            + +
                            +
                            + +

                            ◆ BFloat16QuantizedToFloat_ref()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            void BFloat16QuantizedToFloat_ref (const at::BFloat16 *const input,
                            const size_t numel,
                            float *const output )
                            +
                            + +
                            +
                            + +

                            ◆ binary_search_range()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + +
                            __device__ __forceinline__ void binary_search_range (int * found,
                            const scalar_t * arr,
                            const scalar_t target,
                            const int num_entries )
                            +
                            + +
                            +
                            + +

                            ◆ block_bucketize_sparse_features_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, c10::optional< Tensor >, c10::optional< Tensor >, c10::optional< Tensor > > block_bucketize_sparse_features_cpu (const Tensor & lengths,
                            const Tensor & indices,
                            const bool bucketize_pos,
                            const bool sequence,
                            const Tensor & block_sizes,
                            const int64_t my_size,
                            const c10::optional< Tensor > & weights,
                            const c10::optional< Tensor > & batch_size_per_feature,
                            const int64_t ,
                            const c10::optional< std::vector< at::Tensor > > & block_bucketize_pos )
                            +
                            + +
                            +
                            + +

                            ◆ block_bucketize_sparse_features_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC std::tuple< Tensor, Tensor, c10::optional< Tensor >, c10::optional< Tensor >, c10::optional< Tensor > > block_bucketize_sparse_features_cuda (const Tensor & lengths,
                            const Tensor & indices,
                            const bool bucketize_pos,
                            const bool sequence,
                            const Tensor & block_sizes,
                            const int64_t my_size,
                            const c10::optional< Tensor > & weights,
                            const c10::optional< Tensor > & batch_size_per_feature,
                            const int64_t max_B,
                            const c10::optional< std::vector< at::Tensor > > & block_bucketize_pos )
                            +
                            + +
                            +
                            + +

                            ◆ bucketize_sparse_features_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< at::Tensor, at::Tensor, c10::optional< at::Tensor >, c10::optional< at::Tensor > > bucketize_sparse_features_cpu (const at::Tensor & lengths,
                            const at::Tensor & indices,
                            const bool bucketize_pos,
                            const int64_t my_size,
                            const c10::optional< at::Tensor > & weights )
                            +
                            + +
                            +
                            + +

                            ◆ bucketize_sparse_features_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC std::tuple< Tensor, Tensor, c10::optional< Tensor >, c10::optional< Tensor > > bucketize_sparse_features_cuda (const Tensor & lengths,
                            const Tensor & indices,
                            const bool bucketize_pos,
                            const int64_t my_size,
                            const c10::optional< Tensor > & weights )
                            +
                            + +
                            +
                            + +

                            ◆ calc_offsets_range_thread_block()

                            + +
                            +
                            + + + + + + + + + + + +
                            std::tuple< uint32_t, uint32_t, uint32_t > calc_offsets_range_thread_block (const int64_t output_size,
                            const int64_t num_seq )
                            +
                            + +
                            +
                            + +

                            ◆ cat_reorder_batched_ad_indices_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor cat_reorder_batched_ad_indices_cpu (const Tensor & cat_ad_offsets,
                            const std::vector< Tensor > & ad_indices,
                            const Tensor & reordered_cat_ad_offsets,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_indices,
                            const int64_t total_num_indices,
                            const bool pinned_memory )
                            +
                            + +
                            +
                            + +

                            ◆ cat_reorder_batched_ad_indices_cpu_()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void cat_reorder_batched_ad_indices_cpu_ (const Tensor & cat_ad_offsets,
                            const std::vector< Tensor > & ad_indices,
                            const Tensor & reordered_cat_ad_offsets,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_indices,
                            Tensor & output )
                            +
                            + +
                            +
                            + +

                            ◆ compute_frequency_sequence()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC void compute_frequency_sequence (const Tensor & input,
                            Tensor & output,
                            const int start_input,
                            const int output_size )
                            +
                            + +
                            +
                            + +

                            ◆ compute_num_uint64s()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + +
                            uint64_t compute_num_uint64s (const uint64_t num_elements)
                            +
                            + +
                            +
                            + +

                            ◆ CUDA_KERNEL_LOOP() [1/2]

                            + +
                            +
                            + + + + + + + + + + + +
                            CUDA_KERNEL_LOOP (b_t ,
                            lengths_size  )
                            +
                            + +
                            +
                            + +

                            ◆ CUDA_KERNEL_LOOP() [2/2]

                            + +
                            +
                            + + + + + + + + + + + +
                            CUDA_KERNEL_LOOP (r ,
                            lengths_size  )
                            +
                            + +
                            +
                            + +

                            ◆ dense_to_jagged_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor dense_to_jagged_forward (const Tensor & dense,
                            const std::vector< Tensor > & offsets,
                            c10::optional< at::SymInt > total_L )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_load() [1/3]

                            + +
                            +
                            +
                            +template<typename dst_t , typename src_t >
                            + + + + + + + + + + + +
                            DEVICE_INLINE Vec4T< dst_t > dequantize_load (const src_t * value,
                            const float2  )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_load() [2/3]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + +
                            DEVICE_INLINE Vec4T< float > dequantize_load (const uint8_t * value,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_load() [3/3]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + +
                            DEVICE_INLINE Vec4T< at::Half > dequantize_load (const uint8_t * value,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_packed_hfp8()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ float4 dequantize_packed_hfp8 (uint32_t vals,
                            int exp_bits,
                            int exp_bias )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_permuted_int2()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ half16 dequantize_permuted_int2 (uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_permuted_int4()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ half8 dequantize_permuted_int4 (uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ dequantize_permuted_int8()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ half4 dequantize_permuted_int8 (uint32_t packedVals,
                            __half2 shift_scale )
                            +
                            + +
                            +
                            + +

                            ◆ direct_mapped_lru_cache_populate_byte_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC void direct_mapped_lru_cache_populate_byte_cpu (Tensor weights,
                            Tensor cache_hash_size_cumsum,
                            int64_t total_cache_hash_size,
                            Tensor cache_index_table_map,
                            Tensor weights_offsets,
                            Tensor weights_tys,
                            Tensor D_offsets,
                            Tensor linear_cache_indices,
                            Tensor lxu_cache_state,
                            Tensor lxu_cache_weights,
                            int64_t time_stamp,
                            Tensor lru_state,
                            Tensor lxu_cache_miss_timestamp,
                            int64_t row_alignment,
                            bool gather_cache_stats,
                            c10::optional< Tensor > uvm_cache_stats )
                            +
                            + +
                            +
                            + +

                            ◆ direct_mapped_lxu_cache_lookup_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor direct_mapped_lxu_cache_lookup_cpu (Tensor linear_cache_indices,
                            Tensor lxu_cache_state,
                            int64_t invalid_index,
                            bool gather_cache_stats,
                            c10::optional< Tensor > uvm_cache_stats )
                            +
                            + +
                            +
                            + +

                            ◆ div_round_up()

                            + +
                            +
                            + + + + + + + + + + + +
                            __host__ DEVICE_INLINE int32_t div_round_up (int32_t a,
                            int32_t b )
                            +
                            + +
                            +
                            + +

                            ◆ DivMod()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            fd_num_warps_per_list DivMod (global_warp_id ,
                            reinterpret_cast< int32_t * > & list_id,
                            reinterpret_cast< int32_t * > & warp_id )
                            +
                            + +
                            +
                            + +

                            ◆ dummy_packed_accessor32()

                            + +
                            +
                            +
                            +template<typename scalar_t , int ndim, template< typename U > class PtrTraits = at::DefaultPtrTraits>
                            + + + + + + + +
                            at::PackedTensorAccessor32< scalar_t, ndim, PtrTraits > dummy_packed_accessor32 ()
                            +
                            + +
                            +
                            + +

                            ◆ dummy_packed_accessor64()

                            + +
                            +
                            +
                            +template<typename scalar_t , int ndim, template< typename U > class PtrTraits = at::DefaultPtrTraits>
                            + + + + + + + +
                            at::PackedTensorAccessor64< scalar_t, ndim, PtrTraits > dummy_packed_accessor64 ()
                            +
                            + +
                            +
                            + +

                            ◆ embedding_bag_rowwise_prune()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > embedding_bag_rowwise_prune (const Tensor & weights,
                            const Tensor & indicator,
                            const double threshold,
                            at::ScalarType compressed_indices_dtype,
                            const bool abs,
                            const int64_t min_non_pruned_rows,
                            const c10::optional< double > & min_save_ratio )
                            +
                            + +
                            +
                            + +

                            ◆ embedding_inplace_update_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void embedding_inplace_update_cpu (Tensor dev_weights,
                            Tensor uvm_weights,
                            Tensor weights_placements,
                            Tensor weights_offsets,
                            Tensor weights_tys,
                            Tensor D_offsets,
                            Tensor update_weights,
                            Tensor update_table_idx,
                            Tensor update_row_idx,
                            Tensor update_offsets,
                            const int64_t row_alignment,
                            c10::optional< Tensor > lxu_cache_weights = c10::nullopt,
                            c10::optional< Tensor > lxu_cache_locations = c10::nullopt )
                            +
                            + +
                            +
                            + +

                            ◆ embedding_inplace_update_cpu_kernel()

                            + +
                            +
                            +
                            +template<typename index_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void embedding_inplace_update_cpu_kernel (at::TensorAccessor< uint8_t, 1 > dev_weights,
                            at::TensorAccessor< uint8_t, 1 > uvm_weights,
                            const at::TensorAccessor< int32_t, 1 > & weights_placements,
                            const at::TensorAccessor< int64_t, 1 > & weights_offsets,
                            const at::TensorAccessor< uint8_t, 1 > & weights_tys,
                            const at::TensorAccessor< int32_t, 1 > & D_offsets,
                            const at::TensorAccessor< uint8_t, 1 > & update_weights,
                            const at::TensorAccessor< int32_t, 1 > & update_table_idx,
                            const at::TensorAccessor< index_t, 1 > & update_row_idx,
                            const at::TensorAccessor< int64_t, 1 > & update_offsets,
                            int64_t row_alignment )
                            +
                            + +
                            +
                            + +

                            ◆ embedding_inplace_update_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void embedding_inplace_update_cuda (Tensor dev_weights,
                            Tensor uvm_weights,
                            Tensor weights_placements,
                            Tensor weights_offsets,
                            Tensor weights_tys,
                            Tensor D_offsets,
                            Tensor update_weights,
                            Tensor update_table_idx,
                            Tensor update_row_idx,
                            Tensor update_offsets,
                            const int64_t row_alignment,
                            c10::optional< Tensor > lxu_cache_weights = c10::nullopt,
                            c10::optional< Tensor > lxu_cache_locations = c10::nullopt )
                            +
                            +

                            Embedding tables inplace updates with absolute values (idempotent guarantee)

                            +

                            dev_weights: the loaded tables on device in TBE format uvm_weights: the loaded tables on UVM in TBE format weights_placements: placements for each table weights_offsets: physical offsets for each table weights_tys: weight types for each table D_offsets: table dimensions update_weights: new update weights tensor in TBE format update_table_idx: table indices for every new row update_row_idx: row indices for every new row update_offsets: offsets of new update weights row_alignment: alignment byte for embedding row lxu_cache_weights: the loaded cache weights lxu_cache_locations: the loaded cache location info

                            +

                            it's guaranteed from upper service level that each row of table will only receive one update at a time.

                            +

                            This function has embedding update parameters (update_weights, update_table_idx, updata_offsets) and delta embedding weights on the CUDA devices.

                            + +
                            +
                            + +

                            ◆ exclusive_scan_ptrs_cpu()

                            + +
                            +
                            +
                            +template<class T , class U >
                            + + + + + + + + + + + + + + + + +
                            U exclusive_scan_ptrs_cpu (const int64_t N,
                            const T *const input,
                            U *const output )
                            +
                            + +
                            +
                            + +

                            ◆ expand_into_jagged_permute_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor expand_into_jagged_permute_cpu (const Tensor & permute,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            int64_t output_size )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_GPU_ENUM_REGISTER_START()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_GPU_ENUM_REGISTER_START (uvm ,
                            cudaMemory ,
                            Advise  )
                            +
                            + +
                            +
                            + +

                            ◆ float16_max()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float16_max (float_16 val)
                            +
                            + +
                            +
                            + +

                            ◆ float16_min()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float16_min (float_16 val)
                            +
                            + +
                            +
                            + +

                            ◆ float1_max()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float1_max (float val)
                            +
                            + +
                            +
                            + +

                            ◆ float1_min()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float1_min (float val)
                            +
                            + +
                            +
                            + +

                            ◆ float2_max()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float2_max (float2 val)
                            +
                            + +
                            +
                            + +

                            ◆ float2_min()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float2_min (float2 val)
                            +
                            + +
                            +
                            + +

                            ◆ float4_max()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float4_max (float4 val)
                            +
                            + +
                            +
                            + +

                            ◆ float4_min()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float4_min (float4 val)
                            +
                            + +
                            +
                            + +

                            ◆ float8_max()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float8_max (float8 val)
                            +
                            + +
                            +
                            + +

                            ◆ float8_min()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float float8_min (float8 val)
                            +
                            + +
                            +
                            + +

                            ◆ float_or_half_to_fusednbitrowwise_cpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor float_or_half_to_fusednbitrowwise_cpu (const Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ float_to_fusednbitrowwise_cpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor float_to_fusednbitrowwise_cpu (const Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ float_to_hfp8()

                            + +
                            +
                            + + + + + +
                            + + + + + + + + + + + + + + + + + + + + + +
                            C10_HOST_DEVICE uint8_t float_to_hfp8 (float val_fp,
                            int ebits,
                            int exponent_bias,
                            float max_pos )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ FloatToBFloat16Quantized_ref()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            void FloatToBFloat16Quantized_ref (const float *const input,
                            const size_t numel,
                            uint16_t *const output )
                            +
                            + +
                            +
                            + +

                            ◆ FloatToFP8RowwiseQuantized_meta()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor FloatToFP8RowwiseQuantized_meta (const Tensor & input,
                            bool forward )
                            +
                            + +
                            +
                            + +

                            ◆ for()

                            + +
                            +
                            + + + + + + + +
                            for ()
                            +
                            + +
                            +
                            + +

                            ◆ FP8rowwise_to_float_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor FP8rowwise_to_float_meta (const Tensor & input,
                            bool forward,
                            const int64_t output_dtype )
                            +
                            + +
                            +
                            + +

                            ◆ fused8bitrowwise_to_half_cpu_out()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor & fused8bitrowwise_to_half_cpu_out (Tensor & output,
                            const Tensor & input )
                            +
                            + +
                            +
                            + +

                            ◆ generic_histogram_binning_calibration_by_feature_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > generic_histogram_binning_calibration_by_feature_cuda (const Tensor & logit,
                            const Tensor & segment_value,
                            const Tensor & segment_lengths,
                            int64_t num_segments,
                            const Tensor & bin_num_examples,
                            const Tensor & bin_num_positives,
                            const Tensor & bin_boundaries,
                            double positive_weight,
                            int64_t bin_ctr_in_use_after,
                            double bin_ctr_weight_value )
                            +
                            + +
                            +
                            + +

                            ◆ get_group_index_select_cols_per_warp()

                            + +
                            +
                            + + + + + + + +
                            int get_group_index_select_cols_per_warp ()
                            +
                            + +
                            +
                            + +

                            ◆ get_nvlink_matrix()

                            + +
                            +
                            + + + + + + + +
                            AdjacencyMatrix< Links > get_nvlink_matrix ()
                            +
                            + +
                            +
                            + +

                            ◆ getScalarType()

                            + +
                            +
                            + + + + + +
                            + + + + + + + +
                            at::ScalarType getScalarType (SparseType dtype)
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ getSparseType()

                            + +
                            +
                            + + + + + +
                            + + + + + + + +
                            SparseType getSparseType (at::ScalarType dtype)
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_dim0_gpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            torch::autograd::variable_list group_index_select_dim0_gpu (at::TensorList input_group,
                            at::TensorList indices_group )
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_dim0_gpu_backward_meta()

                            + +
                            +
                            + + + + + + + + + + + +
                            torch::autograd::variable_list group_index_select_dim0_gpu_backward_meta (at::TensorList all_inputs,
                            c10::SymIntArrayRef output_shape_group_ref )
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_dim0_gpu_impl()

                            + +
                            +
                            + + + + + + + + + + + +
                            torch::autograd::variable_list group_index_select_dim0_gpu_impl (at::TensorList all_indices_input,
                            const int64_t group_size )
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_dim0_gpu_impl_meta()

                            + +
                            +
                            + + + + + + + + + + + +
                            torch::autograd::variable_list group_index_select_dim0_gpu_impl_meta (at::TensorList all_indices_input,
                            const int64_t group_size )
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_dim0_unpack()

                            + +
                            +
                            + + + + + + + + + + + +
                            std::pair< std::vector< Tensor >, std::vector< Tensor > > group_index_select_dim0_unpack (at::TensorList all_indices_input,
                            const int64_t group_size )
                            +
                            + +
                            +
                            + +

                            ◆ group_index_select_or_add_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC void group_index_select_or_add_cuda (const int64_t * input_ptrs,
                            const int64_t * output_ptrs,
                            const int64_t * indices_ptrs,
                            const int64_t * warp_offsets_group,
                            const int32_t * num_cols_group,
                            const c10::ScalarType & input_scalar_type,
                            const c10::ScalarType & indices_scalar_type,
                            const c10::DeviceIndex & device,
                            const int num_work_rows,
                            const int64_t total_num_warps,
                            const int group_size,
                            const bool use_index_select,
                            const bool use_var_cols )
                            +
                            + +
                            +
                            + +

                            ◆ half_to_fusednbitrowwise_cpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor half_to_fusednbitrowwise_cpu (const Tensor & input,
                            const int64_t bit_rate )
                            +
                            + +
                            +
                            + +

                            ◆ hfma2()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __forceinline__ __device__ __half2 hfma2 (const __half2 a,
                            const __half2 b,
                            const __half2 c )
                            +
                            + +
                            +
                            + +

                            ◆ hfp8_to_float()

                            + +
                            +
                            + + + + + +
                            + + + + + + + + + + + + + + + + +
                            C10_HOST_DEVICE float hfp8_to_float (uint8_t hfp8_val,
                            int ebits,
                            int exponent_bias )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ histogram_binning_calibration_by_feature_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > histogram_binning_calibration_by_feature_cpu (const Tensor & logit,
                            const Tensor & segment_value,
                            const Tensor & segment_lengths,
                            int64_t num_segments,
                            const Tensor & bin_num_examples,
                            const Tensor & bin_num_positives,
                            int64_t num_bins,
                            double positive_weight,
                            double lower_bound,
                            double upper_bound,
                            int64_t bin_ctr_in_use_after,
                            double bin_ctr_weight_value )
                            +
                            + +
                            +
                            + +

                            ◆ histogram_binning_calibration_by_feature_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > histogram_binning_calibration_by_feature_cuda (const Tensor & logit,
                            const Tensor & segment_value,
                            const Tensor & segment_lengths,
                            int64_t num_segments,
                            const Tensor & bin_num_examples,
                            const Tensor & bin_num_positives,
                            int64_t num_bins,
                            double positive_weight,
                            double lower_bound,
                            double upper_bound,
                            int64_t bin_ctr_in_use_after,
                            double bin_ctr_weight_value )
                            +
                            + +
                            +
                            + +

                            ◆ histogram_binning_calibration_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > histogram_binning_calibration_cuda (const Tensor & logit,
                            const Tensor & bin_num_examples,
                            const Tensor & bin_num_positives,
                            double positive_weight,
                            double lower_bound,
                            double upper_bound,
                            int64_t bin_ctr_in_use_after,
                            double bin_ctr_weight_value )
                            +
                            + +
                            +
                            + +

                            ◆ hmul()

                            + +
                            +
                            + + + + + + + + + + + +
                            __forceinline__ __device__ half hmul (half a,
                            half b )
                            +
                            + +
                            +
                            + +

                            ◆ hmul_short2()

                            + +
                            +
                            + + + + + + + + + + + +
                            __device__ __forceinline__ __half2 hmul_short2 (uint32_t lhs,
                            __half rhs )
                            +
                            + +
                            +
                            + +

                            ◆ if() [1/14]

                            + +
                            +
                            + + + + + + + +
                            if (b >= B)
                            +
                            + +
                            +
                            + +

                            ◆ if() [2/14]

                            + +
                            +
                            + + + + + + + + + + + +
                            if (curr_bin_num_examples ,
                            bin_ctr_in_use_after  )
                            +
                            + +
                            +
                            + +

                            ◆ if() [3/14]

                            + +
                            +
                            + + + + + + + +
                            if (i >= input_size)
                            +
                            + +
                            +
                            + +

                            ◆ if() [4/14]

                            + +
                            +
                            + + + + + + + +
                            if (index >=num_lengths - 1)
                            +
                            + +
                            +
                            + +

                            ◆ if() [5/14]

                            + +
                            +
                            + + + + + + + +
                            if (index >= num_logits)
                            +
                            + +
                            +
                            + +

                            ◆ if() [6/14]

                            + +
                            +
                            + + + + + + + +
                            if (list_id >= num_lists)
                            +
                            + +
                            +
                            + +

                            ◆ if() [7/14]

                            + +
                            +
                            + + + + + + + +
                            if (n >= N)
                            +
                            + +
                            +
                            + +

                            ◆ if() [8/14]

                            + +
                            +
                            + + + + + + + +
                            if (next_offset = curr_offset + 1)
                            +
                            + +
                            +
                            + +

                            ◆ if() [9/14]

                            + +
                            +
                            + + + + + + + +
                            if (per_sample_weights_addrs )
                            +
                            + +
                            +
                            + +

                            ◆ if() [10/14]

                            + +
                            +
                            + + + + + + + +
                            if (run_id >= sorted_linear_indices_num_runs[0])
                            +
                            + +
                            +
                            + +

                            ◆ if() [11/14]

                            + +
                            +
                            + + + + + + + +
                            if (run_id >=sorted_linear_indices_run. size0)
                            +
                            + +
                            +
                            + +

                            ◆ if() [12/14]

                            + +
                            +
                            + + + + + + + +
                            if (SL = = 0)
                            +
                            + +
                            +
                            + +

                            ◆ if() [13/14]

                            + +
                            +
                            + + + + + + + +
                            if (t >= T || b >= batch_size_per_feature[t])
                            +
                            + +
                            +
                            + +

                            ◆ if() [14/14]

                            + +
                            +
                            + + + + + + + +
                            if (threadIdx.x == 0)
                            +
                            + +
                            +
                            + +

                            ◆ inclusive_sum_scan_kernel()

                            + +
                            +
                            +
                            +template<typename scalar_t , int ITEMS_PER_THREAD, int NUM_THREADS_PER_BLOCK>
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __inline__ __device__ void inclusive_sum_scan_kernel (scalar_t(&) arr[ITEMS_PER_THREAD],
                            typename cub::BlockScan< scalar_t, NUM_THREADS_PER_BLOCK >::TempStorage & temp_storage,
                            int * block_flags,
                            volatile scalar_t * block_sums,
                            scalar_t * block_prev,
                            const int num_entries_per_block,
                            const int block_id,
                            const bool is_multi_block,
                            const int signal )
                            +
                            +

                            inclusive_sum_scan_kernel performs intra- and inter-thread block sum scan (i.e., prefix sum scan). We use cub::BlockScan to do inclusive sum within thread block and use a waterfall sync method to perform prefix sum across thread block.

                            +
                            Parameters
                            + + + + + + + + + + +
                            arran array of input values. Its length must be fixed to ITEMS_PER_THREAD
                            temp_storagea shared memory struct for cub::BlockScan
                            block_flagsa global flag buffer for inter-block sync (must be initialized with zeros)
                            block_sumsa global sum buffer for inter-block sync
                            block_preva shared memory pointer for sharing sum from the previous block within a block
                            num_entries_per_blocka number of input entries for this block
                            block_ida relative thread block ID (the first block that contains the first set of input entries has block_id = 0)
                            is_multi_blocka boolean to indicate if inter-block sum scan has to be performed
                            signalIf the value of block_flags of the previous block is equal to signal, it means that the previous block has written its sum to block_sums. We have thread blocks increment the value of block_flags by one after they write their sums to block_sums. We increment the flag instead of setting the flag to a single value to support multiple sequential inclusive_sum_scan_kernel calls (e.g., in the AUC kernel). signal is the order that inclusive_sum_scan_kernel is called. Since we initialize block_flags with zeros, the signal of the first call should be one.
                            +
                            +
                            + +
                            +
                            + +

                            ◆ index_add_with_unique_indices_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor index_add_with_unique_indices_cuda (const Tensor & grad_output,
                            const Tensor & sorted_indices,
                            const Tensor & orig_indices,
                            std::vector< int64_t > & input_shape,
                            const int consecutive_range_start,
                            const int consecutive_range_length )
                            +
                            + +
                            +
                            + +

                            ◆ index_select_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor index_select_cuda (const Tensor & input,
                            const Tensor & indices,
                            const Tensor & orig_indices,
                            const bool indices_sorted )
                            +
                            + +
                            +
                            + +

                            ◆ index_select_dim0_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor index_select_dim0_gpu (const Tensor & input,
                            const Tensor & indices,
                            c10::optional< int64_t > consecutive_range_start,
                            c10::optional< int64_t > consecutive_range_length,
                            c10::optional< bool > skip_indices_sorting_fwd )
                            +
                            + +
                            +
                            + +

                            ◆ index_select_scalar_cumsum_kernel()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename index_t , typename acc_t , int NUM_THREADS_PER_BLOCK, int MAX_ENTRIES_PER_BLOCK>
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void index_select_scalar_cumsum_kernel (at::PackedTensorAccessor32< scalar_t, 1, at::RestrictPtrTraits > output,
                            at::PackedTensorAccessor32< acc_t, 1, at::RestrictPtrTraits > output_cumsum,
                            const at::PackedTensorAccessor32< scalar_t, 1, at::RestrictPtrTraits > input,
                            const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > indices,
                            const int num_batches,
                            const int input_batch_size,
                            const int last_block_num_entries,
                            int * block_flags,
                            acc_t * block_sums )
                            +
                            + +
                            +
                            + +

                            ◆ invert_permute_cpu()

                            + +
                            +
                            + + + + + + + +
                            Tensor invert_permute_cpu (const Tensor & permute)
                            +
                            + +
                            +
                            + +

                            ◆ is_aligned()

                            + +
                            +
                            +
                            +template<class T >
                            + + + + + + + +
                            DEVICE_INLINE bool is_aligned (const void * ptr)
                            +
                            + +
                            +
                            + +

                            ◆ jagged_1d_to_dense_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_1d_to_dense_meta (Tensor values,
                            Tensor offsets,
                            c10::SymInt max_L,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_2d_to_dense_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_2d_to_dense_forward_cpu (Tensor values,
                            Tensor offsets,
                            int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_2d_to_dense_gpu_backward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_2d_to_dense_gpu_backward (Tensor grad_output,
                            at::Tensor offsets,
                            int64_t max_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_2d_to_dense_gpu_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_2d_to_dense_gpu_forward (Tensor values,
                            Tensor offsets,
                            int64_t max_sequence_length )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_2d_to_dense_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_2d_to_dense_meta (Tensor values,
                            Tensor offsets,
                            c10::SymInt max_sequence_length )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_bmm()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_dense_bmm (const Tensor & x_values,
                            const Tensor & x_offsets,
                            const Tensor & y,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_bmm_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_bmm_forward (const Tensor & x_values,
                            const Tensor & x_offsets,
                            const Tensor & y,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_bmm_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_bmm_forward_cuda (const Tensor & x_values,
                            const Tensor & x_offsets,
                            const Tensor & y,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_bmm_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_bmm_forward_meta (const Tensor & x_values,
                            const Tensor & x_offsets,
                            const Tensor & y,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_bmm_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_dense_bmm_kernel (const at::TensorAccessor< scalar_t, 2 > & x_values,
                            const at::TensorAccessor< index_t, 1 > & x_offsets,
                            const at::TensorAccessor< scalar_t, 3 > & y,
                            at::TensorAccessor< scalar_t, 2 > output,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_add_jagged_output()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_dense_elementwise_add_jagged_output (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y_0,
                            const Tensor & y_1 )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_add_jagged_output_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_dense_elementwise_add_jagged_output_forward (const Tensor & x_values,
                            const std::vector< Tensor > & offsets,
                            const Tensor & dense_0,
                            const Tensor & dense_1 )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_add_jagged_output_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_dense_elementwise_add_jagged_output_forward_meta (const at::Tensor & x_values,
                            const std::vector< at::Tensor > & x_offsets,
                            const at::Tensor & y_0,
                            const at::Tensor & y_1 )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_add_jagged_output_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_dense_elementwise_add_jagged_output_meta (const at::Tensor & x_values,
                            const std::vector< at::Tensor > & x_offsets,
                            const at::Tensor & y_0,
                            const at::Tensor &  )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_jagged_output_()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename F >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_dense_dense_elementwise_jagged_output_ (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y_0,
                            const Tensor & y_1,
                            const Tensor & output_values,
                            F f )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_jagged_output_matches_opt()

                            + +
                            +
                            + + + + + +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            bool jagged_dense_dense_elementwise_jagged_output_matches_opt (const int & num_jagged_dim,
                            const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y_0_reshaped,
                            const Tensor & y_1_reshaped,
                            const Tensor & output_values )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_dense_elementwise_jagged_output_opt_()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename F >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_dense_dense_elementwise_jagged_output_opt_ (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y_0,
                            const Tensor & y_1,
                            const Tensor & output_values,
                            F f )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_add_jagged_output_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_add_jagged_output_meta (const at::Tensor & x_values,
                            const std::vector< at::Tensor > & x_offsets,
                            const at::Tensor &  )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_add_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_elementwise_add_meta (const Tensor & ,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_jagged_output_()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename F >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_dense_elementwise_jagged_output_ (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y,
                            const Tensor & output_values,
                            F f )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_jagged_output_opt_()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename F >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_dense_elementwise_jagged_output_opt_ (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y,
                            const Tensor & output_values,
                            F f )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_mul_backward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_dense_elementwise_mul_backward (const Tensor & grad_output,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y,
                            const Tensor & x_values )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_mul_backward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_dense_elementwise_mul_backward_meta (const Tensor & grad_output,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y,
                            const Tensor & x_values )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_mul_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_elementwise_mul_forward (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_mul_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_dense_elementwise_mul_forward_meta (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_dense_elementwise_mul_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, std::vector< Tensor > > jagged_dense_elementwise_mul_meta (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor &  )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_hash_size_cumsum_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_hash_size_cumsum_cuda (const Tensor & offsets,
                            const Tensor & indices,
                            const int64_t batch_size )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_add_2d_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_add_2d_forward_cpu (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            const int64_t num_dense_input_rows,
                            const int64_t num_output_rows )
                            +
                            +

                            Add sequences from input jagged tensor to output jagged tensor based on indices specified in the indices tensor (this function invokes jagged_index_add_2d_kernel)

                            Parameters
                            + + + + + + + +
                            values2D dense value tensor of input jagged tensor
                            indices1D tensor that contains indices to be added in output jagged tensor
                            input_offsets1D tensor that contains offsets of input jagged tensor
                            output_offsets1D tensor that contains offsets of output jagged tensor
                            num_dense_input_rowsThe total number of rows in the 2D dense value tensor of input jagged tensor
                            num_output_rowsThe number of sequences in jagged output tensor
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_add_2d_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_add_2d_forward_cuda (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            const int64_t num_dense_input_rows,
                            const int64_t num_output_rows )
                            +
                            +

                            Add sequences from input jagged tensor to output jagged tensor based on indices specified in the indices tensor (host function for dispatching jagged_index_add_2d_kernel to GPU)

                            Parameters
                            + + + + + + + +
                            values2D dense value tensor of input jagged tensor
                            indices1D tensor that contains indices to be added in output jagged tensor
                            input_offsets1D tensor that contains offsets of input jagged tensor
                            output_offsets1D tensor that contains offsets of output jagged tensor
                            num_dense_input_rowsThe total number of rows in the 2D dense value tensor of input jagged tensor
                            num_output_rowsThe number of sequences in jagged output tensor
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_add_2d_forward_v2_impl()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_add_2d_forward_v2_impl (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            const int64_t num_output_rows )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_add_2d_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_index_add_2d_kernel (at::TensorAccessor< scalar_t, 2 > output,
                            const at::TensorAccessor< scalar_t, 2 > & input,
                            const at::TensorAccessor< offset_t, 1 > & input_offsets,
                            const at::TensorAccessor< index_t, 1 > & indices,
                            const at::TensorAccessor< offset_t, 1 > & output_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_select_2d()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > jagged_index_select_2d (const Tensor & values,
                            const Tensor & lengths,
                            const Tensor & indices )
                            +
                            +

                            Call the autograd function of jagged_index_select_2d

                            +

                            Forward: Copy sequences from input jagged tensor based on indices specified in the indices tensor to output jagged tensor

                            +

                            Backward: Add sequences from output gradient jagged tensor to input gradient jagged tensor based on indices specified in the indices tensor

                            +
                            Parameters
                            + + + + +
                            values2D dense value of input jagged tensor
                            lengths1D tensor that contains sequence lengths of input jagged tensor
                            indices1D tensor that contains indices to be selected from input jagged tensor
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_select_2d_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_select_2d_forward_cpu (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            const int64_t num_dense_output_rows )
                            +
                            +

                            Copy sequences from input jagged tensor based on indices specified in the indices tensor to output jagged tensor (this function invokes jagged_index_select_2d_kernel)

                            Parameters
                            + + + + + + +
                            values2D dense value tensor of input jagged tensor
                            indices1D tensor that contains indices to be selected from input jagged tensor
                            input_offsets1D tensor that contains offsets of input jagged tensor
                            output_offsets1D tensor that contains offsets of output jagged tensor
                            num_dense_output_rowsThe total number of rows in the 2D dense value tensor of output jagged tensor
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_select_2d_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_select_2d_forward_cuda (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets,
                            const int64_t num_dense_output_rows )
                            +
                            +

                            Copy sequences from input jagged tensor based on indices specified in the indices tensor to an output jagged tensor (host function for dispatching jagged_index_select_2d_kernel to GPU)

                            Parameters
                            + + + + + + +
                            values2D dense value tensor of input jagged tensor
                            indices1D tensor that contains indices to be selected from input jagged tensor
                            input_offsets1D tensor that contains offsets of input jagged tensor
                            output_offsets1D tensor that contains offsets of output jagged tensor
                            num_dense_output_rowsThe total number of rows in the 2D dense value tensor of output jagged tensor
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_select_2d_forward_v2_impl()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_index_select_2d_forward_v2_impl (const Tensor & values,
                            const Tensor & indices,
                            const Tensor & input_offsets,
                            const Tensor & output_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_index_select_2d_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_index_select_2d_kernel (at::TensorAccessor< scalar_t, 2 > output,
                            const at::TensorAccessor< scalar_t, 2 > & input,
                            const at::TensorAccessor< offset_t, 1 > & input_offsets,
                            const at::TensorAccessor< index_t, 1 > & indices,
                            const at::TensorAccessor< offset_t, 1 > & output_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_bmm()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_jagged_bmm (const Tensor & x_values,
                            const Tensor & y_values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_bmm_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_jagged_bmm_forward (const Tensor & x_values,
                            const Tensor & y_values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_bmm_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_jagged_bmm_forward_cuda (const Tensor & x_values,
                            const Tensor & y_values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_bmm_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_jagged_bmm_forward_meta (const Tensor & x_values,
                            const Tensor & y_values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_bmm_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_jagged_bmm_kernel (const at::TensorAccessor< scalar_t, 2 > & x_values,
                            const at::TensorAccessor< scalar_t, 2 > & y_values,
                            const at::TensorAccessor< index_t, 1 > & offsets,
                            at::TensorAccessor< scalar_t, 3 > output,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_jagged_elementwise_dense_output_()

                            + +
                            +
                            +
                            +template<typename scalar_t , typename F >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_jagged_elementwise_dense_output_ (const Tensor & x_values,
                            const std::vector< Tensor > & x_offsets,
                            const Tensor & y_values,
                            const Tensor & output,
                            F f,
                            const scalar_t padding_value = static_cast<scalar_t>(0) )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_slice()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_slice (const Tensor & values,
                            const Tensor & lengths,
                            const Tensor & start,
                            const int64_t slice_length )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_slice_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_slice_forward_cpu (const Tensor & x_values,
                            const Tensor & x_lengths,
                            const Tensor & src_start,
                            const Tensor & output_lengths,
                            const Tensor & tgt_start,
                            const int64_t num_output_rows,
                            const int64_t slice_length,
                            const bool fill_zeros )
                            +
                            +

                            Slice the jagged dim to a maximum length of slice_length, beginning at the start point start. This is a jagged -> jagged op

                            Parameters
                            + + + + + + + + + +
                            x_values- X values of shape B * J_DIM where J_DIM is jagged dim
                            x_lengths- length along jagged dim
                            src_start- start of slice operation from the src tensor
                            output_lengths- length of jagged dim for output tensor
                            tgt_start- position to start filling in sliced values from source
                            num_output_rows- output dense dim
                            slice_length- length of jagged dim to slice
                            fill_zeros- this option exists as an optimization so that the same code path can be reused for forward & backward. The backward pass needs zeros filled into the output tensor, but the forward pass does not.
                            +
                            +
                            + +
                            +
                            + +

                            ◆ jagged_slice_forward_cpu_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_slice_forward_cpu_kernel (at::TensorAccessor< scalar_t, 1 > output,
                            const at::TensorAccessor< offset_t, 1 > & output_lengths,
                            const at::TensorAccessor< offset_t, 1 > & output_offsets,
                            const at::TensorAccessor< offset_t, 1 > & tgt_start,
                            const at::TensorAccessor< scalar_t, 1 > & input,
                            const at::TensorAccessor< offset_t, 1 > & input_lengths,
                            const at::TensorAccessor< offset_t, 1 > & input_offsets,
                            const at::TensorAccessor< offset_t, 1 > & src_start,
                            const int64_t slice_length )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > jagged_softmax (const Tensor & values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_backward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_backward (const Tensor & grad_output,
                            const Tensor & output,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_backward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_backward_cuda (const Tensor & grad_output,
                            const Tensor & output,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_backward_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_softmax_backward_kernel (const at::TensorAccessor< scalar_t, 2 > & grad_output,
                            const at::TensorAccessor< scalar_t, 2 > & output,
                            const at::TensorAccessor< index_t, 1 > & offsets,
                            at::TensorAccessor< scalar_t, 2 > grad_input,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_backward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_backward_meta (const Tensor & grad_output,
                            const Tensor & output,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_forward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_forward (const Tensor & values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_forward_cuda (const Tensor & values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_softmax_forward_meta (const Tensor & values,
                            const Tensor & offsets,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_softmax_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + +
                            void jagged_softmax_kernel (const at::TensorAccessor< scalar_t, 2 > & values,
                            const at::TensorAccessor< index_t, 1 > & offsets,
                            at::TensorAccessor< scalar_t, 2 > output,
                            const int64_t max_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_to_padded_dense_backward()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            at::Tensor jagged_to_padded_dense_backward (const Tensor & grad_output,
                            const std::vector< Tensor > & offsets,
                            at::SymInt total_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_to_padded_dense_backward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor jagged_to_padded_dense_backward_meta (const at::Tensor & grad_output,
                            const std::vector< Tensor > & offsets,
                            at::SymInt total_L )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_to_padded_dense_forward_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_to_padded_dense_forward_meta (const Tensor & values,
                            const std::vector< Tensor > & offsets,
                            c10::SymIntArrayRef max_lengths,
                            const double padding_value = 0 )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_to_padded_dense_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor jagged_to_padded_dense_meta (const Tensor & values,
                            const std::vector< Tensor > & offsets,
                            const c10::SymIntArrayRef max_lengths,
                            const double padding_value = 0 )
                            +
                            + +
                            +
                            + +

                            ◆ jagged_unique_indices_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, Tensor, Tensor > jagged_unique_indices_cuda (const Tensor & hash_size_cumsum,
                            const Tensor & hash_size_offsets,
                            const Tensor & offsets,
                            const Tensor & indices )
                            +
                            + +
                            +
                            + +

                            ◆ keyed_jagged_index_add_dim1_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void keyed_jagged_index_add_dim1_kernel (at::PackedTensorAccessor64< scalar_t, 1, at::RestrictPtrTraits > output,
                            const at::PackedTensorAccessor64< scalar_t, 1, at::RestrictPtrTraits > input,
                            const at::PackedTensorAccessor32< offset_t, 1, at::RestrictPtrTraits > input_offsets,
                            const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > indices,
                            const at::PackedTensorAccessor32< offset_t, 1, at::RestrictPtrTraits > output_offsets,
                            const int num_batches,
                            const int output_batch_size )
                            +
                            + +
                            +
                            + +

                            ◆ keyed_jagged_index_select_dim1_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void keyed_jagged_index_select_dim1_kernel (at::PackedTensorAccessor64< scalar_t, 1, at::RestrictPtrTraits > output,
                            at::PackedTensorAccessor64< weight_t, 1, at::RestrictPtrTraits > output_weights,
                            const at::PackedTensorAccessor64< scalar_t, 1, at::RestrictPtrTraits > input,
                            const at::PackedTensorAccessor64< weight_t, 1, at::RestrictPtrTraits > weights,
                            const at::PackedTensorAccessor32< offset_t, 1, at::RestrictPtrTraits > input_offsets,
                            const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > indices,
                            const at::PackedTensorAccessor32< offset_t, 1, at::RestrictPtrTraits > output_offsets,
                            const int num_batches,
                            const int input_batch_size )
                            +
                            + +
                            +
                            + +

                            ◆ keyed_jagged_index_select_dim_1_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > keyed_jagged_index_select_dim_1_gpu (const Tensor & values,
                            const Tensor & lengths,
                            const Tensor & offsets,
                            const Tensor & indices,
                            const int64_t batch_size,
                            const c10::optional< Tensor > & weights )
                            +
                            + +
                            +
                            + +

                            ◆ lengths_range()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor lengths_range (const Tensor & t_in,
                            const c10::optional< std::vector< int64_t > > & shape )
                            +
                            + +
                            +
                            + +

                            ◆ lengths_range_cuda()

                            + +
                            +
                            + + + + + + + + + + + +
                            DLL_PUBLIC Tensor lengths_range_cuda (const Tensor & t_in,
                            const c10::optional< std::vector< int64_t > > & shape )
                            +
                            + +
                            +
                            + +

                            ◆ lengths_range_out()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor & lengths_range_out (Tensor & output,
                            const Tensor & t_in,
                            const c10::optional< std::vector< int64_t > > & shape )
                            +
                            + +
                            +
                            + +

                            ◆ lfu_cache_find_uncached_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::pair< Tensor, Tensor > lfu_cache_find_uncached_cuda (Tensor unique_indices,
                            Tensor unique_indices_length,
                            int64_t max_indices,
                            Tensor lxu_cache_state,
                            Tensor lfu_state )
                            +
                            + +
                            +
                            + +

                            ◆ lfu_cache_populate_byte_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC void lfu_cache_populate_byte_cpu (Tensor weights,
                            Tensor cache_hash_size_cumsum,
                            int64_t total_cache_hash_size,
                            Tensor cache_index_table_map,
                            Tensor weights_offsets,
                            Tensor weights_tys,
                            Tensor D_offsets,
                            Tensor linear_cache_indices,
                            Tensor lxu_cache_state,
                            Tensor lxu_cache_weights,
                            Tensor lfu_state,
                            int64_t row_alignment )
                            +
                            + +
                            +
                            + +

                            ◆ lfu_update_counts_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            void lfu_update_counts_cuda (Tensor unique_indices,
                            Tensor unique_indices_length,
                            Tensor unique_indices_count,
                            Tensor lfu_state )
                            +
                            + +
                            +
                            + +

                            ◆ linearize_cache_indices_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor linearize_cache_indices_cpu (Tensor cache_hash_size_cumsum,
                            Tensor indices,
                            Tensor offsets )
                            +
                            + +
                            +
                            + +

                            ◆ linearize_cache_indices_from_row_idx_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor linearize_cache_indices_from_row_idx_cpu (Tensor cache_hash_size_cumsum,
                            Tensor update_table_indices,
                            Tensor update_row_indices )
                            +
                            + +
                            +
                            + +

                            ◆ load_qparams_from_row()

                            + +
                            +
                            +
                            +template<typename emb_t >
                            + + + + + + + +
                            DEVICE_INLINE float2 load_qparams_from_row (emb_t * qparam_ptr)
                            +
                            + +
                            +
                            + +

                            ◆ lookup_batched_unary_embedding_function()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor lookup_batched_unary_embedding_function (const Tensor & weight,
                            const Tensor & table_offsets,
                            const Tensor & offsets,
                            const Tensor & indices )
                            +
                            + +
                            +
                            + +

                            ◆ lru_cache_populate_byte_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC void lru_cache_populate_byte_cpu (Tensor weights,
                            Tensor cache_hash_size_cumsum,
                            int64_t total_cache_hash_size,
                            Tensor cache_index_table_map,
                            Tensor weights_offsets,
                            Tensor weights_tys,
                            Tensor D_offsets,
                            Tensor linear_cache_indices,
                            Tensor lxu_cache_state,
                            Tensor lxu_cache_weights,
                            int64_t time_stamp,
                            Tensor lru_state,
                            int64_t row_alignment,
                            bool gather_cache_stats,
                            c10::optional< Tensor > uvm_cache_stats )
                            +
                            + +
                            +
                            + +

                            ◆ lxu_cache_lookup_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor lxu_cache_lookup_cpu (Tensor linear_cache_indices,
                            Tensor lxu_cache_state,
                            int64_t invalid_index,
                            bool gather_cache_stats,
                            c10::optional< Tensor > uvm_cache_stats,
                            c10::optional< Tensor > num_uniq_cache_indices,
                            c10::optional< Tensor > lxu_cache_locations_output )
                            +
                            + +
                            +
                            + +

                            ◆ make_zero_float2()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float2 make_zero_float2 ()
                            +
                            + +
                            +
                            + +

                            ◆ make_zero_float4()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float4 make_zero_float4 ()
                            +
                            + +
                            +
                            + +

                            ◆ make_zero_float8()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float8 make_zero_float8 ()
                            +
                            + +
                            +
                            + +

                            ◆ make_zero_float_16()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE float_16 make_zero_float_16 ()
                            +
                            + +
                            +
                            + +

                            ◆ masked_select_jagged_1d()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > masked_select_jagged_1d (const Tensor & values,
                            const Tensor & lengths,
                            const Tensor & mask )
                            +
                            + +
                            +
                            + +

                            ◆ max()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + +
                            + + + + + + + + + + + +
                            __device__ T max (const T * from,
                            const T * to )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ merge_pooled_embeddings()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor merge_pooled_embeddings (std::vector< Tensor > pooled_embeddings,
                            int64_t uncat_dim_size,
                            at::Device target_device,
                            int64_t cat_dim = 1 )
                            +
                            + +
                            +
                            + +

                            ◆ merge_pooled_embeddings_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor merge_pooled_embeddings_cpu (std::vector< Tensor > pooled_embeddings,
                            int64_t ,
                            at::Device target_device,
                            int64_t cat_dim = 1 )
                            +
                            + +
                            +
                            + +

                            ◆ min()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + +
                            + + + + + + + + + + + +
                            __device__ T min (const T * from,
                            const T * to )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ native_empty_like()

                            + +
                            +
                            + + + + + + + +
                            Tensor native_empty_like (const Tensor & self)
                            +
                            + +
                            +
                            + +

                            ◆ nearest_rounding_vector() [1/4]

                            + +
                            +
                            +
                            +template<typename dst_t , typename src_t >
                            + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void nearest_rounding_vector (dst_t * output,
                            const Vec4T< src_t > & value,
                            const float2  )
                            +
                            + +
                            +
                            + +

                            ◆ nearest_rounding_vector() [2/4]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void nearest_rounding_vector (uint8_t * output,
                            const Vec4T< at::Half > & value,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ nearest_rounding_vector() [3/4]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void nearest_rounding_vector (uint8_t * output,
                            const Vec4T< double > & value,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ nearest_rounding_vector() [4/4]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void nearest_rounding_vector (uint8_t * output,
                            const Vec4T< float > & value,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ new_unified_tensor_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor new_unified_tensor_cpu (const Tensor & self,
                            const std::vector< std::int64_t > & sizes,
                            bool is_host_mapped )
                            +
                            + +
                            +
                            + +

                            ◆ offset_tbe_input_combine_with_length_args()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void offset_tbe_input_combine_with_length_args (uint64_t ** indices_addrs,
                            uint64_t ** lengths_addrs,
                            uint64_t ** indices_offsets,
                            uint64_t ** lengths_offsets,
                            uint64_t ** per_sample_weights_addrs,
                            uint32_t ** indices_is_long,
                            uint32_t ** lengths_is_long,
                            uint64_t * base_addr,
                            const uint64_t *const ptr_offsets,
                            const bool need_weights )
                            +
                            + +
                            +
                            + +

                            ◆ offsets_range_cpu()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor offsets_range_cpu (const Tensor & offsets,
                            int64_t range_size )
                            +
                            + +
                            +
                            + +

                            ◆ offsets_range_cuda()

                            + +
                            +
                            + + + + + + + + + + + +
                            DLL_PUBLIC Tensor offsets_range_cuda (const Tensor & offsets,
                            int64_t range_size )
                            +
                            + +
                            +
                            + +

                            ◆ pack_segments_autograd()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor pack_segments_autograd (const Tensor & t_in,
                            const Tensor & lengths,
                            const at::SymInt max_length )
                            +
                            + +
                            +
                            + +

                            ◆ pack_segments_backward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor pack_segments_backward_cpu (const Tensor & data,
                            const Tensor & lengths,
                            const int64_t total_length,
                            const int64_t max_length )
                            +
                            +

                            Map N+1 dim tensor to N dim based on lengths tensor Sequences that are shorter than the longest sequence are padded with zeros.

                            Parameters
                            + + + + + +
                            dataN+1 dim Tensor.
                            lengths1D int/long tensor contains the length in each of the input.
                            total_lengthSum of elements in the 1D tensor legnths
                            max_lengthThe pre-defined max_length for the packed segments. -1 means autodetect
                            +
                            +
                            +
                            Returns
                            unpacked_tensor N-dimensional tensor
                            + +
                            +
                            + +

                            ◆ pack_segments_backward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor pack_segments_backward_cuda (const Tensor & data,
                            const Tensor & lengths,
                            int64_t total_length,
                            int64_t max_length )
                            +
                            +

                            Map N+1 dim tensor to N dim based on lengths tensor Sequences that are shorter than the longest sequence are padded with zeros.

                            Parameters
                            + + + + + +
                            dataN+1 dim Tensor.
                            lengths1D int/long tensor contains the length in each of the input.
                            total_lengthSum of elements in the 1D tensor legnths
                            max_lengthThe pre-defined max_length for the packed segments.
                            +
                            +
                            +
                            Returns
                            unpacked_tensor N-dimensional tensor
                            + +
                            +
                            + +

                            ◆ pack_segments_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor pack_segments_cpu (const Tensor & t_in,
                            const Tensor & lengths,
                            const int64_t max_length )
                            +
                            + +
                            +
                            + +

                            ◆ pack_segments_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor pack_segments_cuda (const Tensor & t_in,
                            const Tensor & lengths,
                            const int64_t max_length )
                            +
                            + +
                            +
                            + +

                            ◆ pack_segments_cuda_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void pack_segments_cuda_kernel (const Data_T *const data_ptr,
                            const int64_t data_size_0,
                            const Length_T *const lengths_ptr,
                            const Length_T *const lengths_cum_sum,
                            const Length_T max_length,
                            const int64_t num_seq,
                            const int64_t cell_size,
                            const Data_T padding,
                            Data_T *const out_ptr,
                            TORCH_DSA_KERNEL_ARGS  )
                            +
                            + +
                            +
                            + +

                            ◆ pack_segments_forward_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor pack_segments_forward_cpu (const Tensor & t_in,
                            const Tensor & lengths,
                            const int64_t max_length )
                            +
                            +

                            Map N dim tensor to N+1 dim based on lengths tensor. Sequences that are shorter than the longest sequence are padded with zeros.

                            Parameters
                            + + + + +
                            t_inN dim Tensor.
                            lengths1D int/long tensor contains the length in each of the output.
                            max_lengthThe pre-defined max_length for the packed segments. -1 means autodetect
                            +
                            +
                            +
                            Returns
                            packed_tensor packed_tensor N + 1 dim Tensor where dim(1) is the max length, dim(0) is the batch size.
                            + +
                            +
                            + +

                            ◆ pack_segments_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor pack_segments_forward_cuda (const Tensor & t_in,
                            const Tensor & lengths,
                            const int64_t max_length )
                            +
                            +

                            Map N dim tensor to N+1 dim based on lengths tensor. Sequences that are shorter than the longest sequence are padded with zeros.

                            Parameters
                            + + + + +
                            t_inN dim Tensor.
                            lengths1D int/long tensor contains the length in each of the output.
                            max_lengthThe pre-defined max_length for the packed segments.
                            +
                            +
                            +
                            Returns
                            packed_tensor packed_tensor N + 1 dim Tensor where dim(1) is the max length, dim(0) is the batch size.
                            + +
                            +
                            + +

                            ◆ padding_fused_tbe_input_combine_with_length_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, Tensor > padding_fused_tbe_input_combine_with_length_cpu (const std::vector< Tensor > & indices_list,
                            const std::vector< Tensor > & lengths_list,
                            const std::vector< Tensor > & per_sample_weights,
                            int64_t batch_size )
                            +
                            +

                            padding_fused_tbe_input_combine_with_length_cpu is similar to tbe_input_combine_with_length_cpu, but padding all the lengths to the size specified by batch_size.

                            +
                            Parameters
                            + + + + +
                            indices_listlist of indices.
                            lengths_listlist of lengths.
                            per_sample_weightslist of per_sample_weights
                            +
                            +
                            +
                            Returns
                            tuple of combined indices, lengths, and per_sample_weights
                            + +
                            +
                            + +

                            ◆ permute102_baddbmm_permute102_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor permute102_baddbmm_permute102_cpu (const Tensor & bias,
                            const Tensor & A,
                            const Tensor & B )
                            +
                            + +
                            +
                            + +

                            ◆ permute102_baddbmm_permute102_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor permute102_baddbmm_permute102_cuda (const Tensor & bias,
                            const Tensor & A,
                            const Tensor & B )
                            +
                            + +
                            +
                            + +

                            ◆ permute_1D_sparse_data_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, c10::optional< Tensor > > permute_1D_sparse_data_cpu (const Tensor & permute,
                            const Tensor & lengths,
                            const Tensor & indices,
                            const c10::optional< Tensor > & weights,
                            const c10::optional< int64_t > & permuted_lengths_sum )
                            +
                            + +
                            +
                            + +

                            ◆ permute_2D_sparse_data_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, c10::optional< Tensor > > permute_2D_sparse_data_cpu (const Tensor & permute,
                            const Tensor & lengths,
                            const Tensor & indices,
                            const c10::optional< Tensor > & weights,
                            const c10::optional< int64_t > & permuted_lengths_sum )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_auto_grad_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu (const Tensor & pooled_embs,
                            const Tensor & offset_dim_list,
                            const Tensor & permute_list,
                            const Tensor & inv_offset_dim_list,
                            const Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_auto_grad_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_auto_grad_gpu (const Tensor & pooled_embs,
                            const Tensor & offset_dim_list,
                            const Tensor & permute_list,
                            const Tensor & inv_offset_dim_list,
                            const Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_auto_grad_split_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_auto_grad_split_cpu (const Tensor & pooled_embs,
                            const Tensor & offset_dim_list,
                            const Tensor & permute_list,
                            const Tensor & inv_offset_dim_list,
                            const Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_auto_grad_split_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_auto_grad_split_gpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor permute_duplicate_pooled_embs_cpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_gpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_split_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_split_cpu (const Tensor & pooled_embs,
                            const Tensor & offset_dim_list,
                            const Tensor & permute_list,
                            const Tensor & inv_offset_dim_list,
                            const Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_duplicate_pooled_embs_split_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_duplicate_pooled_embs_split_gpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_embeddings_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void permute_embeddings_kernel (int32_t len,
                            int32_t T,
                            int32_t B,
                            const scalar_t *__restrict__ embeddings,
                            const int32_t *__restrict__ permute,
                            const index_t *__restrict__ input_offsets,
                            const index_t *__restrict__ output_offsets,
                            scalar_t *__restrict__ permuted_embeddings )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_auto_grad_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor permute_pooled_embs_auto_grad_meta (const Tensor & pooled_embs,
                            const Tensor & ,
                            const Tensor & ,
                            const Tensor & ,
                            const Tensor &  )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor permute_pooled_embs_cpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_pooled_embs_gpu (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_gpu_impl()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_pooled_embs_gpu_impl (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list,
                            const bool & allow_duplicates = false )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_meta()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            at::Tensor permute_pooled_embs_meta (const Tensor & pooled_embs,
                            const Tensor & ,
                            const Tensor & ,
                            const Tensor & ,
                            const Tensor &  )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_split_cpu_impl()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_pooled_embs_split_cpu_impl (const Tensor & pooled_embs,
                            const Tensor & offset_dim_list,
                            const Tensor & permute_list,
                            const Tensor & inv_offset_dim_list,
                            const Tensor & inv_permute_list,
                            const bool & allow_duplicates )
                            +
                            + +
                            +
                            + +

                            ◆ permute_pooled_embs_split_gpu_impl()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor permute_pooled_embs_split_gpu_impl (const at::Tensor & pooled_embs,
                            const at::Tensor & offset_dim_list,
                            const at::Tensor & permute_list,
                            const at::Tensor & inv_offset_dim_list,
                            const at::Tensor & inv_permute_list,
                            const bool & allow_duplicates )
                            +
                            + +
                            +
                            + +

                            ◆ permute_sequence_embeddings_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor > permute_sequence_embeddings_cpu (const Tensor & permute,
                            const Tensor & lengths,
                            const Tensor & embeddings )
                            +
                            + +
                            +
                            + +

                            ◆ permute_sequence_embeddings_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC std::tuple< Tensor, Tensor > permute_sequence_embeddings_cuda (const Tensor & permute,
                            const Tensor & lengths,
                            const Tensor & embeddings )
                            +
                            + +
                            +
                            + +

                            ◆ permute_sparse_features_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, c10::optional< Tensor > > permute_sparse_features_cpu (const Tensor & permute,
                            const Tensor & lengths,
                            const Tensor & indices,
                            const c10::optional< Tensor > & weights )
                            +
                            + +
                            +
                            + +

                            ◆ prefix_sum()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + + + + + + + + + + +
                            void prefix_sum (const int length,
                            const T *const array,
                            T *const presum )
                            +
                            + +
                            +
                            + +

                            ◆ pruned_array_lookup_from_row_idx_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor pruned_array_lookup_from_row_idx_cpu (const Tensor & update_row_indices,
                            const Tensor & update_table_indices,
                            const Tensor & index_remappings,
                            const Tensor & index_remappings_offsets )
                            +
                            + +
                            +
                            + +

                            ◆ pruned_array_lookup_from_row_idx_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor pruned_array_lookup_from_row_idx_cuda (const Tensor & update_row_indices,
                            const Tensor & update_table_indices,
                            const Tensor & index_remappings,
                            const Tensor & index_remappings_offsets )
                            +
                            +

                            Index remapping function that returns the remapped indices.

                            +

                            Args: update_row_indices: row indices for every new row update_table_indices: table indices for every new row index_remappings: concated index remapping for every embedding table index_remappings_offsets: offset for each embedding table

                            +

                            Returns: remapped indices for each new row.

                            + +
                            +
                            + +

                            ◆ quantize_store()

                            + +
                            +
                            +
                            +template<typename dst_t , typename src_t >
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void quantize_store (dst_t * output,
                            const Vec4T< src_t > & value,
                            StochasticRoundingRNGState * state,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_indices_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor reorder_batched_ad_indices_cpu (const Tensor & cat_ad_offsets,
                            const Tensor & cat_ad_indices,
                            const Tensor & reordered_cat_ad_offsets,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_indices,
                            const int64_t num_indices_after_broadcast )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_indices_cpu_()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void reorder_batched_ad_indices_cpu_ (const Tensor & cat_ad_offsets,
                            const Tensor & cat_ad_indices,
                            const Tensor & reordered_cat_ad_offsets,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_indices,
                            Tensor & output )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_indices_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor reorder_batched_ad_indices_gpu (const Tensor & cat_ad_offsets,
                            const Tensor & cat_ad_indices,
                            const Tensor & reordered_cat_ad_offsets,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_indices,
                            const int64_t num_indices_after_broadcast )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_lengths_()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void reorder_batched_ad_lengths_ (const Tensor & cat_ad_lengths,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_lengths,
                            Tensor & output )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_lengths_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            Tensor reorder_batched_ad_lengths_cpu (const Tensor & cat_ad_lengths,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ reorder_batched_ad_lengths_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor reorder_batched_ad_lengths_gpu (const Tensor & cat_ad_lengths,
                            const Tensor & batch_offsets,
                            const int64_t num_ads_in_batch,
                            const bool broadcast_lengths )
                            +
                            + +
                            +
                            + +

                            ◆ report_embedding_error()

                            + +
                            +
                            +
                            +template<typename IndexType >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void report_embedding_error (int t,
                            int B,
                            int b_begin,
                            int b_end,
                            const IndexType * offsets_data,
                            const IndexType * indices_data,
                            int64_t hash_size,
                            bool allow_minus_one = false )
                            +
                            +

                            report error from fbgemm cpu embedding lookup kernels @params allow_minus_one true for embedding kernels generated with scale_bias_last == false that can take -1 indices (output from pruned embedding id mapping)

                            + +
                            +
                            + +

                            ◆ rk_double()

                            + +
                            +
                            + + + + + + + +
                            __device__ double rk_double (rk_state * state)
                            +
                            + +
                            +
                            + +

                            ◆ rk_random()

                            + +
                            +
                            + + + + + + + +
                            __device__ unsigned long rk_random (rk_state * state)
                            +
                            + +
                            +
                            + +

                            ◆ rk_seed()

                            + +
                            +
                            + + + + + + + + + + + +
                            __device__ void rk_seed (unsigned long long s,
                            rk_state * state )
                            +
                            + +
                            +
                            + +

                            ◆ rk_zipf()

                            + +
                            +
                            + + + + + + + + + + + +
                            __device__ long rk_zipf (rk_state * state,
                            double a )
                            +
                            + +
                            +
                            + +

                            ◆ round_down()

                            + +
                            +
                            + + + + + + + + + + + +
                            __host__ DEVICE_INLINE int32_t round_down (int32_t a,
                            int32_t b )
                            +
                            + +
                            +
                            + +

                            ◆ segment_sum_csr_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            Tensor segment_sum_csr_cpu (const int64_t batch_size,
                            const Tensor & csr_seg,
                            const Tensor & values )
                            +
                            + +
                            +
                            + +

                            ◆ segment_sum_csr_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor segment_sum_csr_cuda (const int64_t batch_size,
                            const Tensor & csr_seg,
                            const Tensor & values )
                            +
                            + +
                            +
                            + +

                            ◆ shfl_down_sync()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE T shfl_down_sync (const T val,
                            unsigned delta,
                            int width = kWarpSize,
                            unsigned shfl_sync_mask = kFullWarpMask )
                            +
                            + +
                            +
                            + +

                            ◆ shfl_sync()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE T shfl_sync (const T val,
                            int srcLane = 0,
                            int width = kWarpSize,
                            unsigned shfl_sync_mask = kFullWarpMask )
                            +
                            + +
                            +
                            + +

                            ◆ shfl_xor()

                            + +
                            +
                            +
                            +template<typename T >
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE T shfl_xor (const T val,
                            int laneMask,
                            int width = kWarpSize,
                            unsigned shfl_sync_mask = kFullWarpMask )
                            +
                            + +
                            +
                            + +

                            ◆ should_prune()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            bool should_prune (const Tensor & weights,
                            const int64_t num_rows_kept,
                            double min_save_ratio )
                            +
                            + +
                            +
                            + +

                            ◆ splitmix64_stateless()

                            + +
                            +
                            + + + + + + + +
                            __host__ DEVICE_INLINE uint64_t splitmix64_stateless (uint64_t index)
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_1d_to_dense_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > stacked_jagged_1d_to_dense_cpu (Tensor values,
                            Tensor lengths,
                            const std::vector< int64_t > & offset_per_key,
                            const std::vector< int64_t > & max_lengths_per_key,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_1d_to_dense_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > stacked_jagged_1d_to_dense_gpu (Tensor values,
                            Tensor lengths,
                            const std::vector< int64_t > & offset_per_key,
                            const std::vector< int64_t > & max_lengths_per_key,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_2d_to_dense_backward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            Tensor stacked_jagged_2d_to_dense_backward_cuda (int64_t B,
                            int64_t D,
                            int64_t total_L,
                            const std::vector< Tensor > & grad_padded_values_per_key,
                            const std::vector< Tensor > & offsets_tensor_per_key,
                            const std::vector< int64_t > & offset_per_key )
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_2d_to_dense_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > stacked_jagged_2d_to_dense_cpu (Tensor values,
                            Tensor lengths,
                            const std::vector< int64_t > & offset_per_key,
                            const std::vector< int64_t > & max_lengths_per_key,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_2d_to_dense_forward_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< std::vector< Tensor >, std::vector< Tensor > > stacked_jagged_2d_to_dense_forward_cuda (Tensor values,
                            Tensor lengths,
                            const std::vector< int64_t > & offset_per_key,
                            const std::vector< int64_t > & max_lengths_per_key,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ stacked_jagged_2d_to_dense_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::vector< Tensor > stacked_jagged_2d_to_dense_gpu (Tensor values,
                            Tensor lengths,
                            const std::vector< int64_t > & offset_per_key,
                            const std::vector< int64_t > & max_lengths_per_key,
                            int64_t padding_value )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_init()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_init (uint64_t s0,
                            uint64_t s1,
                            StochasticRoundingRNGState * state )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_rand4()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE uint4 stochastic_rounding_rand4 (StochasticRoundingRNGState * state)
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_vector() [1/5]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_vector (at::Half * output,
                            const Vec4T< at::Half > & value,
                            StochasticRoundingRNGState & state,
                            const float2  )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_vector() [2/5]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_vector (at::Half * output,
                            const Vec4T< float > & value,
                            StochasticRoundingRNGState & state,
                            const float2  )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_vector() [3/5]

                            + +
                            +
                            +
                            +template<typename dst_t , typename src_t >
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_vector (dst_t * output,
                            const Vec4T< src_t > & value,
                            StochasticRoundingRNGState & state,
                            const float2  )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_vector() [4/5]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_vector (uint8_t * output,
                            const Vec4T< at::Half > & value,
                            StochasticRoundingRNGState & state,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ stochastic_rounding_vector() [5/5]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void stochastic_rounding_vector (uint8_t * output,
                            const Vec4T< float > & value,
                            StochasticRoundingRNGState & state,
                            const float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ store_qparams_to_row() [1/2]

                            + +
                            +
                            +
                            +template<typename emb_t >
                            + + + + + + + + + + + +
                            DEVICE_INLINE void store_qparams_to_row (emb_t * ptr,
                            float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ store_qparams_to_row() [2/2]

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + + + + + +
                            DEVICE_INLINE void store_qparams_to_row (uint8_t * ptr,
                            float2 qparams )
                            +
                            + +
                            +
                            + +

                            ◆ sum_reduce_to_one_device()

                            + +
                            +
                            + + + + + + + + + + + +
                            Tensor sum_reduce_to_one_device (std::vector< Tensor > input_tensors,
                            at::Device target_device )
                            +
                            + +
                            +
                            + +

                            ◆ syncwarp()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE void syncwarp ()
                            +
                            + +
                            +
                            + +

                            ◆ tbe_input_combine_with_length_cpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, Tensor > tbe_input_combine_with_length_cpu (const std::vector< Tensor > & indices_list,
                            const std::vector< Tensor > & lengths_list,
                            const std::vector< Tensor > & per_sample_weights )
                            +
                            + +
                            +
                            + +

                            ◆ tbe_input_combine_with_length_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, Tensor > tbe_input_combine_with_length_cuda (const uint64_t *const indices_addrs,
                            const uint64_t *const lengths_addrs,
                            const uint64_t *const per_sample_weights_addrs,
                            const uint32_t *const indices_is_long,
                            const uint32_t *const lengths_is_long,
                            const uint64_t *const indices_offsets,
                            const uint64_t *const lengths_offsets,
                            const uint64_t num_lists,
                            const uint64_t total_indices,
                            const uint64_t total_lengths,
                            const uint64_t max_list_size,
                            const c10::DeviceIndex & device )
                            +
                            + +
                            +
                            + +

                            ◆ tbe_input_combine_with_length_gpu()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            std::tuple< Tensor, Tensor, Tensor > tbe_input_combine_with_length_gpu (const std::vector< Tensor > & indices_list,
                            const std::vector< Tensor > & lengths_list,
                            const std::vector< Tensor > & per_sample_weights )
                            +
                            + +
                            +
                            + +

                            ◆ thrust_find_qparams() [1/2]

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + +
                            __device__ float2 thrust_find_qparams (fbgemm_gpu::Vec4T< scalar_t > * input_row,
                            int D )
                            +
                            + +
                            +
                            + +

                            ◆ thrust_find_qparams() [2/2]

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + +
                            __device__ float2 thrust_find_qparams (scalar_t * input_row,
                            int D )
                            +
                            + +
                            +
                            + +

                            ◆ to_bfloat16()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE __nv_bfloat16 to_bfloat16 (float v)
                            +
                            + +
                            +
                            + +

                            ◆ to_bfloat16_16()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE bfloat16_16 to_bfloat16_16 (float_16 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_bfloat16_2()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE __nv_bfloat162 to_bfloat16_2 (float2 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_bfloat16_4()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE bfloat16_4 to_bfloat16_4 (float4 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_bfloat16_8()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE bfloat16_8 to_bfloat16_8 (float8 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_half()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE __half to_half (float v)
                            +
                            + +
                            +
                            + +

                            ◆ to_half16()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE half16 to_half16 (float_16 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_half2()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE __half2 to_half2 (float2 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_half4()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE half4 to_half4 (float4 v)
                            +
                            + +
                            +
                            + +

                            ◆ to_half8()

                            + +
                            +
                            + + + + + + + +
                            DEVICE_INLINE half8 to_half8 (float8 v)
                            +
                            + +
                            +
                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            + +

                            ◆ TORCH_LIBRARY_IMPL()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            TORCH_LIBRARY_IMPL (fbgemm ,
                            CUDA ,
                            m  )
                            +
                            + +
                            +
                            + +

                            ◆ trapz_kernel()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __inline__ __device__ void trapz_kernel (scalar_t * output,
                            const scalar_t * y,
                            const scalar_t * x,
                            const scalar_t * block_y,
                            const scalar_t * block_x,
                            const int num_entries_per_block,
                            const int block_id )
                            +
                            + +
                            +
                            + +

                            ◆ unpack_segments_cuda_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            __global__ void unpack_segments_cuda_kernel (const Data_T *const data_ptr,
                            const Length_T *const lengths_ptr,
                            const Length_T *const lengths_cum_sum,
                            const Length_T max_length,
                            const int64_t num_seq,
                            const int64_t cell_size,
                            Data_T *const out_ptr )
                            +
                            + +
                            +
                            + +

                            ◆ vec4_acc()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + +
                            DEVICE_INLINE Vec4T< scalar_t > vec4_acc (const Vec4T< scalar_t > & lhs,
                            const Vec4T< scalar_t > & rhs )
                            +
                            + +
                            +
                            + +

                            ◆ vec4_max()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + +
                            DEVICE_INLINE scalar_t vec4_max (const fbgemm_gpu::Vec4T< scalar_t > & vec4)
                            +
                            + +
                            +
                            + +

                            ◆ vec4_min()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + +
                            DEVICE_INLINE scalar_t vec4_min (const fbgemm_gpu::Vec4T< scalar_t > & vec4)
                            +
                            + +
                            +
                            + +

                            ◆ vec_copy_with_implicit_type_cast()

                            + +
                            +
                            +
                            +template<typename src_t , typename dst_t , uint32_t VEC_WIDTH>
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            DEVICE_INLINE void vec_copy_with_implicit_type_cast (dst_t *const __restrict__ dst,
                            const uint64_t src_addr,
                            const uint64_t src_offset,
                            const uint64_t dst_offset,
                            const uint64_t src_bound )
                            +
                            + +
                            +
                            + +

                            ◆ VEC_WIDTH() [1/2]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            indices_is_long &[is_long_idx] is_long_mask VEC_WIDTH (combined_indices ,
                            indices_addrs [list_id],
                            src_idx ,
                            indices_start+ src_idx,
                            indices_end - indices_start )
                            +
                            + +
                            +
                            + +

                            ◆ VEC_WIDTH() [2/2]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            lengths_is_long &[is_long_idx] is_long_mask VEC_WIDTH (combined_lengths ,
                            lengths_addrs [list_id],
                            src_idx ,
                            lengths_start+ src_idx,
                            lengths_end - lengths_start )
                            +
                            + +
                            +
                            + +

                            ◆ warp_find_qparams()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + +
                            DEVICE_INLINE float2 warp_find_qparams (scalar_t local_min,
                            scalar_t local_max )
                            +
                            + +
                            +
                            + +

                            ◆ warp_reduce_max()

                            + +
                            +
                            +
                            +template<typename T , int ReduceWidth = kWarpSize>
                            + + + + + + + +
                            DEVICE_INLINE T warp_reduce_max (T val)
                            +
                            + +
                            +
                            + +

                            ◆ warp_reduce_min()

                            + +
                            +
                            +
                            +template<typename T , int ReduceWidth = kWarpSize>
                            + + + + + + + +
                            DEVICE_INLINE T warp_reduce_min (T val)
                            +
                            + +
                            +
                            + +

                            ◆ warpBitonicMergeLE16()

                            + +
                            +
                            +
                            +template<typename K , typename V , int32_t L, bool Dir, typename Comp , bool IsBitonic>
                            + + + + + +
                            + + + + + + + + + + + +
                            __device__ void warpBitonicMergeLE16 (K & k,
                            V & v )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ warpReduceAllSum()

                            + +
                            +
                            +
                            +template<typename T , int ReduceWidth = kWarpSize>
                            + + + + + + + + + + + +
                            DEVICE_INLINE T warpReduceAllSum (T val,
                            unsigned shfl_sync_mask = kFullWarpMask )
                            +
                            + +

                            Sums a register value across all warp threads.

                            + +
                            +
                            + +

                            ◆ while()

                            + +
                            +
                            + + + + + + + +
                            while (left ! = right)
                            +
                            + +
                            +
                            + +

                            ◆ zipf_cuda()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            DLL_PUBLIC Tensor zipf_cuda (const double a,
                            const int64_t n,
                            const int64_t seed )
                            +
                            + +
                            +
                            + +

                            ◆ zipf_kernel()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            __global__ void zipf_kernel (const double a,
                            const int64_t seed,
                            at::PackedTensorAccessor64< long, 1, at::RestrictPtrTraits > y )
                            +
                            + +
                            +
                            +

                            Variable Documentation

                            + +

                            ◆ B

                            + +
                            +
                            + + + + +
                            __global__ int32_t B
                            +
                            + +
                            +
                            + +

                            ◆ b

                            + +
                            +
                            + + + + +
                            const auto b = blockIdx.x * blockDim.x + threadIdx.x
                            +
                            + +
                            +
                            + +

                            ◆ batch_size

                            + +
                            +
                            + + + + +
                            __global__ int batch_size
                            +
                            + +
                            +
                            + +

                            ◆ batch_size_offsets

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ batch_size_per_feature

                            + +
                            +
                            + + + + +
                            __global__ const int const offset_t* const __restrict__ batch_size_per_feature
                            +
                            + +
                            +
                            + +

                            ◆ bin_boundaries

                            + + + +

                            ◆ bin_ctr_in_use_after

                            + +
                            +
                            + + + + +
                            __global__ const int64_t const int64_t const double const int64_t bin_ctr_in_use_after
                            +
                            + +
                            +
                            + +

                            ◆ bin_ctr_weight_value

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ bin_ids_data

                            + +
                            +
                            +Initial value:
                            {
                            +
                            const int32_t index = blockIdx.x * blockDim.x + threadIdx.x
                            +
                            indices_is_long &[is_long_idx] is_long_mask int32_t
                            Definition input_combine.cu:73
                            +
                            +
                            +
                            + +

                            ◆ bin_num_examples_data

                            + + + +

                            ◆ bin_num_positives_data

                            + + + +

                            ◆ block_bucketize_pos_concat

                            + + + +

                            ◆ block_bucketize_pos_offsets

                            + + + +

                            ◆ block_sizes_data

                            + +
                            +
                            + + + + +
                            __global__ int32_t const index_t *__restrict__ block_sizes_data
                            +
                            + +
                            +
                            + +

                            ◆ calibrated_prediction_data

                            + + + +

                            ◆ combined_lengths

                            + +
                            +
                            + + + + +
                            __global__ int32_t* const __restrict__ combined_lengths
                            +
                            + +
                            +
                            + +

                            ◆ combined_weights

                            + +
                            +
                            + + + + +
                            __global__ int32_t* const __restrict__ float* const __restrict__ combined_weights
                            +
                            + +
                            +
                            + +

                            ◆ csr_seg_data

                            + +
                            +
                            + + + + +
                            __global__ int const int* csr_seg_data
                            +
                            + +
                            +
                            + +

                            ◆ curr_bin_id

                            + +
                            +
                            + + + + +
                            const int curr_bin_id = left
                            +
                            + +
                            +
                            + +

                            ◆ curr_bin_num_examples

                            + +
                            +
                            + + + + +
                            const auto curr_bin_num_examples = bin_num_examples_data[bin_ids_data[index]]
                            +
                            + +
                            +
                            + +

                            ◆ curr_offset

                            + +
                            +
                            + + + + +
                            const auto curr_offset = segment_offsets_data[index]
                            +
                            + +
                            +
                            + +

                            ◆ curr_segment_value

                            + +
                            +
                            + + + + +
                            const int64_t curr_segment_value
                            +
                            +Initial value:
                            =
                            + +
                            ? 0
                            +
                            : std::max(0L, dense_segment_value_data[index] * num_bins)
                            +
                            uint32_t L
                            Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:744
                            +
                            __global__ const int64_t const int64_t num_segments
                            Definition histogram_binning_calibration_ops.cu:135
                            +
                            __global__ const ValueType *const const OffsetType *const ValueType *const dense_segment_value_data
                            Definition histogram_binning_calibration_ops.cu:113
                            +
                            +
                            +
                            + +

                            ◆ dense_segment_value_data

                            + +
                            +
                            +Initial value:
                            {
                            +
                            const int32_t index = blockIdx.x * blockDim.x + threadIdx.x
                            +
                            +
                            +
                            + +

                            ◆ else

                            + +
                            +
                            + + + + +
                            else
                            +
                            +Initial value:
                            {
                            + +
                            __global__ const int64_t const double const double const int64_t const double const T *const const double *const const double *const T *const calibrated_prediction_data
                            Definition histogram_binning_calibration_ops.cu:31
                            +
                            const double uncalibrated
                            Definition histogram_binning_calibration_ops.cu:39
                            +
                            +
                            +
                            + +

                            ◆ fd_num_warps_per_list

                            + +
                            +
                            +Initial value:
                            {
                            +
                            const auto global_warp_id = blockIdx.x * blockDim.y + threadIdx.y
                            +
                            const int32_t global_warp_id
                            Definition gen_embedding_forward_split_unweighted_v2_kernel.cu:676
                            +
                            +
                            +
                            + +

                            ◆ grad_output

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ grad_sum

                            + +
                            +
                            + + + + +
                            at::acc_type<scalar_t, true> grad_sum = 0.0
                            +
                            + +
                            +
                            + +

                            ◆ grad_weight

                            + +
                            +
                            + + + + +
                            grad_weight[n *sum_E+table_offset+idx] = grad_sum
                            +
                            + +
                            +
                            + +

                            ◆ GROUP_INDEX_SELECT_COLS_PER_WARP

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int GROUP_INDEX_SELECT_COLS_PER_WARP
                            +
                            +constexpr
                            +
                            +Initial value:
                            =
                            + +
                            template __global__ kWarpSize
                            Definition gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu:1952
                            +
                            constexpr int GROUP_INDEX_SELECT_UNROLL_FACTOR
                            Definition sparse_group_index.cu:16
                            +
                            +
                            +
                            + +

                            ◆ GROUP_INDEX_SELECT_LOG_COLS_PER_WARP

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int GROUP_INDEX_SELECT_LOG_COLS_PER_WARP
                            +
                            +constexpr
                            +
                            +Initial value:
                            =
                            + +
                            Definition sparse_ops_utils.h:535
                            +
                            +
                            +
                            + +

                            ◆ GROUP_INDEX_SELECT_UNROLL_FACTOR

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int GROUP_INDEX_SELECT_UNROLL_FACTOR = 1
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ group_size

                            + +
                            +
                            +Initial value:
                            {
                            +
                            const auto total_num_warps = warp_offsets_group[group_size]
                            +
                            __global__ const int64_t const int64_t const int64_t * warp_offsets_group
                            Definition sparse_group_index.cu:41
                            +
                            __global__ const int64_t const int64_t const int64_t const int32_t const int64_t const int64_t group_size
                            Definition sparse_group_index.cu:44
                            +
                            +
                            +
                            + +

                            ◆ idx

                            + +
                            +
                            + + + + +
                            int64_t idx = linear_index - table_offset
                            +
                            + +
                            +
                            + +

                            ◆ indices

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ indices_addrs

                            + + + +

                            ◆ indices_data

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ indices_end

                            + +
                            +
                            + + + + +
                            index_t indices_end = indices_offsets[list_id + 1]
                            +
                            + +
                            +
                            + +

                            ◆ indices_is_long

                            + + + +

                            ◆ indices_offsets

                            + + + +

                            ◆ indices_ptrs

                            + +
                            +
                            + + + + +
                            __global__ const int64_t const int64_t* indices_ptrs
                            +
                            + +
                            +
                            + +

                            ◆ indices_start

                            + +
                            +
                            + + + + +
                            index_t indices_start = indices_offsets[list_id]
                            +
                            + +
                            +
                            + +

                            ◆ indices_to_lb

                            + + + +

                            ◆ info

                            + +
                            +
                            + + + + +
                            const auto info
                            +
                            +Initial value:
                            =
                            +
                            reinterpret_cast<const uint32_t*>(sorted_infos)[segment_start]
                            +
                            __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > const int32_t *__restrict__ const int32_t *__restrict__ sorted_infos
                            Definition sparse_batched_unary_embeddings.cu:126
                            +
                            +
                            +
                            + +

                            ◆ info_B_mask

                            + +
                            +
                            +Initial value:
                            {
                            +
                            int32_t run_id = blockIdx.x * blockDim.x + threadIdx.x
                            +
                            +
                            +
                            + +

                            ◆ info_B_num_bits

                            + + + +

                            ◆ input_offsets

                            + + + +

                            ◆ input_size

                            + +
                            +
                            + + + + +
                            __global__ const offsets_t *__restrict__ int32_t input_size
                            +
                            +Initial value:
                            {
                            +
                            const int i = blockDim.x * blockIdx.x + threadIdx.x
                            +
                            +
                            +
                            + +

                            ◆ int32_t

                            + +
                            +
                            + + + + +
                            lengths_is_long& [is_long_idx] is_long_mask int32_t
                            +
                            + +
                            +
                            + +

                            ◆ is_long_idx

                            + +
                            +
                            + + + + +
                            const uint32_t is_long_idx = list_id / IS_LONG_NUM_BITS
                            +
                            + +
                            +
                            + +

                            ◆ is_long_mask

                            + +
                            +
                            + + + + +
                            const uint32_t is_long_mask = 1u << (list_id % IS_LONG_NUM_BITS)
                            +
                            + +
                            +
                            + +

                            ◆ IS_LONG_NUM_BITS

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr uint32_t IS_LONG_NUM_BITS = 32
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ kCacheLocationMissing

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int32_t kCacheLocationMissing = -1
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ L

                            + +
                            +
                            + + + + +
                            int32_t L = indices_end - indices_start
                            +
                            + +
                            +
                            + +

                            ◆ left

                            + +
                            +
                            + + + + +
                            int left = 0
                            +
                            + +
                            +
                            + +

                            ◆ length_to_feature_idx

                            + + + +

                            ◆ lengths

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ lengths_addrs

                            + + + +

                            ◆ lengths_end

                            + +
                            +
                            + + + + +
                            const auto lengths_end = lengths_offsets[list_id + 1]
                            +
                            + +
                            +
                            + +

                            ◆ lengths_is_long

                            + + + +

                            ◆ lengths_offsets

                            + + + +

                            ◆ lengths_start

                            + +
                            +
                            + + + + +
                            const auto lengths_start = lengths_offsets[list_id]
                            +
                            + +
                            +
                            + +

                            ◆ linear_index

                            + +
                            +
                            + + + + +
                            int64_t linear_index = sorted_linear_indices_run[run_id]
                            +
                            + +
                            +
                            + +

                            ◆ list_id

                            + +
                            +
                            + + + + +
                            uint32_t list_id
                            +
                            + +
                            +
                            + +

                            ◆ logit_data

                            + + + +

                            ◆ MAX_ELEMENTS_PER_THREAD

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr int MAX_ELEMENTS_PER_THREAD = 4
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ my_size

                            + +
                            +
                            + + + + +
                            __global__ int my_size
                            +
                            + +
                            +
                            + +

                            ◆ n

                            + +
                            +
                            + + + + +
                            int32_t n = blockIdx.z
                            +
                            + +
                            +
                            + +

                            ◆ new_indices_data

                            + + + +

                            ◆ new_lengths_data

                            + +
                            +
                            +Initial value:
                            {
                            +
                            using uscalar_t = std::make_unsigned_t<scalar_t>
                            +
                            +
                            +
                            + +

                            ◆ new_offsets_data

                            + + + +

                            ◆ new_pos_data

                            + +
                            +
                            +Initial value:
                            {
                            +
                            using uindex_t = std::make_unsigned_t<index_t>
                            +
                            +
                            +
                            + +

                            ◆ new_weights_data

                            + + + +

                            ◆ next_offset

                            + +
                            +
                            + + + + +
                            const auto next_offset = segment_offsets_data[index + 1]
                            +
                            + +
                            +
                            + +

                            ◆ NUM_ARGS

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr uint32_t NUM_ARGS = 7
                            +
                            +constexpr
                            +
                            + +
                            +
                            + +

                            ◆ num_bins

                            + +
                            +
                            + + + + +
                            __global__ const int64_t num_bins
                            +
                            + +
                            +
                            + +

                            ◆ num_cols_group

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ num_lists

                            + + + +

                            ◆ num_segments

                            + +
                            +
                            + + + + +
                            __global__ const int64_t const int64_t num_segments
                            +
                            + +
                            +
                            + +

                            ◆ num_work_rows

                            + + + +

                            ◆ offsets

                            + + + +

                            ◆ offsets_data

                            + +
                            +
                            + + + + +
                            __global__ int64_t const scalar_t *__restrict__ offsets_data
                            +
                            + +
                            +
                            + +

                            ◆ output

                            + +
                            +
                            + + + + +
                            __global__ int64_t * output
                            +
                            +Initial value:
                            {
                            +
                            index_t sum_E = table_offsets[T]
                            +
                            __launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta const emb_t *__restrict__ const const cache_t *__restrict__ const const int32_t *__restrict__ const const uint32_t const uint32_t T
                            Definition gen_embedding_forward_split_unweighted_codegen_cuda.cu:61
                            +
                            __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ table_offsets
                            Definition sparse_batched_unary_embeddings.cu:24
                            +
                            index_t sum_E
                            Definition sparse_batched_unary_embeddings.cu:166
                            +
                            +
                            +
                            + +

                            ◆ output_data

                            + +
                            +
                            + + + + +
                            __global__ int const int const scalar_t scalar_t* output_data
                            +
                            +Initial value:
                            {
                            +
                            typedef FBGEMM_GPU_CUB_NS_PREFIX cub::BlockReduce<scalar_t, 256> BlockReduce
                            +
                            #define FBGEMM_GPU_CUB_NS_PREFIX
                            Definition cub_namespace_postfix.cuh:34
                            +
                            +
                            +
                            + +

                            ◆ output_offsets

                            + + + +

                            ◆ output_permute

                            + +
                            +
                            +Initial value:
                            {
                            +
                            const int32_t t_start = blockIdx.x * blockDim.y + threadIdx.y
                            +
                            +
                            +
                            + +

                            ◆ output_ptrs

                            + +
                            +
                            + + + + +
                            __global__ const int64_t* output_ptrs
                            +
                            + +
                            +
                            + +

                            ◆ per_sample_weights_addrs

                            + + + +

                            ◆ permute

                            + + + +

                            ◆ permuted_indices

                            + + + +

                            ◆ permuted_lengths_size

                            + +
                            +
                            + + + + +
                            __global__ int32_t permuted_lengths_size
                            +
                            + +
                            +
                            + +

                            ◆ permuted_weights

                            + +
                            +
                            +Initial value:
                            {
                            +
                            int32_t b_t_start = blockIdx.x * blockDim.y + threadIdx.y
                            +
                            +
                            +
                            + +

                            ◆ pre_sigmoid

                            + +
                            +
                            + + + + +
                            const LogitType pre_sigmoid = logit_data[index] + recalibrate_value
                            +
                            + +
                            +
                            + +

                            ◆ range_data

                            + +
                            +
                            +Initial value:
                            {
                            +
                            int start_row_idx = blockIdx.x * blockDim.y + threadIdx.y
                            +
                            +
                            +
                            + +

                            ◆ range_size

                            + +
                            +
                            + + + + +
                            __global__ int64_t range_size
                            +
                            + +
                            +
                            + +

                            ◆ recalibrate_value

                            + +
                            +
                            + + + + +
                            __global__ const int64_t const int64_t const double recalibrate_value
                            +
                            + +
                            +
                            + +

                            ◆ right

                            + +
                            +
                            + + + + +
                            int right = num_bins - 1
                            +
                            + +
                            +
                            + +

                            ◆ seg_end

                            + +
                            +
                            + + + + +
                            int seg_end = csr_seg_data[blockIdx.x + 1] * batch_size
                            +
                            + +
                            +
                            + +

                            ◆ seg_start

                            + +
                            +
                            + + + + +
                            int seg_start = csr_seg_data[blockIdx.x] * batch_size
                            +
                            + +
                            +
                            + +

                            ◆ segment_end

                            + +
                            +
                            + + + + +
                            int32_t segment_end
                            +
                            +Initial value:
                            =
                            + +
                            __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > const int32_t *__restrict__ sorted_linear_indices_cumulative_run_lengths
                            Definition sparse_batched_unary_embeddings.cu:125
                            +
                            +
                            +
                            + +

                            ◆ segment_offsets_data

                            + +
                            +
                            + + + + +
                            __global__ const ValueType* const const OffsetType* const segment_offsets_data
                            +
                            + +
                            +
                            + +

                            ◆ segment_start

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ segment_value_data

                            + +
                            +
                            + + + + +
                            __global__ const ValueType* const segment_value_data
                            +
                            + +
                            +
                            + +

                            ◆ SL

                            + +
                            +
                            + + + + +
                            int32_t SL = segment_end - segment_start
                            +
                            + +
                            +
                            + +

                            ◆ sorted_infos

                            + +
                            +
                            + +
                            +
                            + +

                            ◆ sorted_linear_indices_cumulative_run_lengths

                            + +
                            +
                            + + + + +
                            __global__ const int32_t const int32_t const scalar_t* __restrict__ const index_t* __restrict__ scalar_t* __restrict__ const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> const int32_t* __restrict__ sorted_linear_indices_cumulative_run_lengths
                            +
                            + +
                            +
                            + +

                            ◆ sorted_linear_indices_num_runs

                            + +
                            +
                            + + + + +
                            __global__ const int32_t const int32_t const scalar_t* __restrict__ const index_t* __restrict__ scalar_t* __restrict__ const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> const int32_t* __restrict__ const int32_t* __restrict__ const int32_t* __restrict__ sorted_linear_indices_num_runs
                            +
                            + +
                            +
                            + +

                            ◆ sorted_linear_indices_run

                            + +
                            +
                            + + + + +
                            __global__ const int32_t const int32_t const scalar_t* __restrict__ const index_t* __restrict__ scalar_t* __restrict__ const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> sorted_linear_indices_run
                            +
                            + +
                            +
                            + +

                            ◆ src_idx

                            + +
                            +
                            + + + + +
                            const uint64_t src_idx = (warp_id * kWarpSize + threadIdx.x) * VEC_WIDTH
                            +
                            + +
                            +
                            + +

                            ◆ start_input

                            + +
                            +
                            + + + + +
                            __global__ int64_t index_t start_input
                            +
                            + +
                            +
                            + +

                            ◆ step

                            + + + +

                            ◆ stride

                            + +
                            +
                            + + + + +
                            const int stride = gridDim.x * blockDim.y
                            +
                            + +
                            +
                            + +

                            ◆ sum

                            + +
                            +
                            + + + + +
                            scalar_t sum = 0.0
                            +
                            + +
                            +
                            + +

                            ◆ sum_E

                            + +
                            +
                            + + + + +
                            index_t sum_E = table_offsets[T]
                            +
                            + +
                            +
                            + +

                            ◆ T

                            + +
                            +
                            + + + + +
                            __global__ int32_t T
                            +
                            + +
                            +
                            + +

                            ◆ t

                            + +
                            +
                            + + + + +
                            const auto t = blockIdx.y
                            +
                            + +
                            +
                            + +

                            ◆ table_offset

                            + +
                            +
                            + + + + +
                            index_t table_offset = table_offsets[t]
                            +
                            + +
                            +
                            + +

                            ◆ table_offsets

                            + + + +

                            ◆ temp_storage

                            + +
                            +
                            + + + + +
                            __shared__ BlockReduce::TempStorage temp_storage
                            +
                            + +
                            +
                            + +

                            ◆ unbucketize_permute_data

                            + + + +

                            ◆ uncalibrated

                            + +
                            +
                            + + + + +
                            const double uncalibrated = 1.0 / (1.0 + exp(-pre_sigmoid))
                            +
                            + +
                            +
                            + +

                            ◆ values_data

                            + +
                            +
                            + + + + +
                            __global__ int const int const scalar_t* values_data
                            +
                            + +
                            +
                            + +

                            ◆ vec_copy_with_implicit_type_cast< int64_t, int32_t, VEC_WIDTH >

                            + + + +

                            ◆ warp_id

                            + +
                            +
                            + + + + +
                            uint32_t warp_id
                            +
                            + +
                            +
                            + +

                            ◆ warp_offsets_group

                            + +
                            +
                            + + + + +
                            __global__ const int64_t const int64_t const int64_t* warp_offsets_group
                            +
                            + +
                            +
                            + +

                            ◆ weight

                            + + + +

                            ◆ weights

                            + + + +

                            ◆ weights_data

                            + + +
                            + + + + diff --git a/namespaceinternal.html b/namespaceinternal.html new file mode 100644 index 000000000..6cc297d16 --- /dev/null +++ b/namespaceinternal.html @@ -0,0 +1,247 @@ + + + + + + + +fbgemm_gpu: internal Namespace Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            + +
                            internal Namespace Reference
                            +
                            +
                            + + + + +

                            +Classes

                            struct  HyperCompressedSparseColumn
                             
                            +

                            Function Documentation

                            + +

                            ◆ csr2csc()

                            + +
                            +
                            +
                            +template<typename scalar_t >
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            void csr2csc (HyperCompressedSparseColumn & csc,
                            int B,
                            const at::TensorAccessor< int64_t, 1 > & csr_offsets,
                            const at::TensorAccessor< int64_t, 1 > & csr_indices,
                            const at::TensorAccessor< scalar_t, 1 > & csr_weights,
                            int64_t pooling_mode,
                            const int * table_to_feature_offset,
                            int64_t num_embeddings )
                            +
                            + +
                            +
                            + +

                            ◆ csr2csc< double >()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template void csr2csc< double > (HyperCompressedSparseColumn & csc,
                            int B,
                            const at::TensorAccessor< int64_t, 1 > & csr_offsets,
                            const at::TensorAccessor< int64_t, 1 > & csr_indices,
                            const at::TensorAccessor< double, 1 > & csr_weights,
                            int64_t pooling_mode,
                            const int * table_to_feature_offset,
                            int64_t num_embeddings )
                            +
                            + +
                            +
                            + +

                            ◆ csr2csc< float >()

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template void csr2csc< float > (HyperCompressedSparseColumn & csc,
                            int B,
                            const at::TensorAccessor< int64_t, 1 > & csr_offsets,
                            const at::TensorAccessor< int64_t, 1 > & csr_indices,
                            const at::TensorAccessor< float, 1 > & csr_weights,
                            int64_t pooling_mode,
                            const int * table_to_feature_offset,
                            int64_t num_embeddings )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/namespacemembers.html b/namespacemembers.html new file mode 100644 index 000000000..dc9101907 --- /dev/null +++ b/namespacemembers.html @@ -0,0 +1,146 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - _ -

                            +
                            + + + + diff --git a/namespacemembers_a.html b/namespacemembers_a.html new file mode 100644 index 000000000..90bb84329 --- /dev/null +++ b/namespacemembers_a.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - a -

                            +
                            + + + + diff --git a/namespacemembers_b.html b/namespacemembers_b.html new file mode 100644 index 000000000..11f0a594d --- /dev/null +++ b/namespacemembers_b.html @@ -0,0 +1,115 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - b -

                            +
                            + + + + diff --git a/namespacemembers_c.html b/namespacemembers_c.html new file mode 100644 index 000000000..77166d701 --- /dev/null +++ b/namespacemembers_c.html @@ -0,0 +1,107 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - c -

                            +
                            + + + + diff --git a/namespacemembers_d.html b/namespacemembers_d.html new file mode 100644 index 000000000..1cf65b596 --- /dev/null +++ b/namespacemembers_d.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - d -

                            +
                            + + + + diff --git a/namespacemembers_e.html b/namespacemembers_e.html new file mode 100644 index 000000000..606e30650 --- /dev/null +++ b/namespacemembers_e.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - e -

                            +
                            + + + + diff --git a/namespacemembers_enum.html b/namespacemembers_enum.html new file mode 100644 index 000000000..d085167d3 --- /dev/null +++ b/namespacemembers_enum.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace enums with links to the namespace documentation for each enum:
                            +
                            + + + + diff --git a/namespacemembers_eval.html b/namespacemembers_eval.html new file mode 100644 index 000000000..a275d4360 --- /dev/null +++ b/namespacemembers_eval.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace enum values with links to the namespace documentation for each enum value:
                            +
                            + + + + diff --git a/namespacemembers_f.html b/namespacemembers_f.html new file mode 100644 index 000000000..22f082fbd --- /dev/null +++ b/namespacemembers_f.html @@ -0,0 +1,119 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - f -

                            +
                            + + + + diff --git a/namespacemembers_func.html b/namespacemembers_func.html new file mode 100644 index 000000000..c3b358eb1 --- /dev/null +++ b/namespacemembers_func.html @@ -0,0 +1,146 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - _ -

                            +
                            + + + + diff --git a/namespacemembers_func_a.html b/namespacemembers_func_a.html new file mode 100644 index 000000000..9eccf05ca --- /dev/null +++ b/namespacemembers_func_a.html @@ -0,0 +1,107 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - a -

                            +
                            + + + + diff --git a/namespacemembers_func_b.html b/namespacemembers_func_b.html new file mode 100644 index 000000000..477e85b54 --- /dev/null +++ b/namespacemembers_func_b.html @@ -0,0 +1,100 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - b -

                              +
                            • ballot_sync() : fbgemm_gpu
                            • +
                            • batch_auc() : fbgemm_gpu
                            • +
                            • batched_dense_vec_jagged_2d_mul() : fbgemm_gpu
                            • +
                            • batched_dense_vec_jagged_2d_mul_backward() : fbgemm_gpu
                            • +
                            • batched_dense_vec_jagged_2d_mul_backward_meta() : fbgemm_gpu
                            • +
                            • batched_dense_vec_jagged_2d_mul_forward() : fbgemm_gpu
                            • +
                            • batched_dense_vec_jagged_2d_mul_forward_meta() : fbgemm_gpu
                            • +
                            • batched_unary_embeddings_backward_cuda() : fbgemm_gpu
                            • +
                            • batched_unary_embeddings_forward_cpu() : fbgemm_gpu
                            • +
                            • batched_unary_embeddings_forward_cuda() : fbgemm_gpu
                            • +
                            • BFloat16QuantizedToFloat_ref() : fbgemm_gpu
                            • +
                            • binary_search_range() : fbgemm_gpu
                            • +
                            • block_bucketize_sparse_features_cpu() : fbgemm_gpu
                            • +
                            • block_bucketize_sparse_features_cuda() : fbgemm_gpu
                            • +
                            • bucketize_sparse_features_cpu() : fbgemm_gpu
                            • +
                            • bucketize_sparse_features_cuda() : fbgemm_gpu
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_c.html b/namespacemembers_func_c.html new file mode 100644 index 000000000..06de7563c --- /dev/null +++ b/namespacemembers_func_c.html @@ -0,0 +1,99 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - c -

                              +
                            • calc_offsets_range_thread_block() : fbgemm_gpu
                            • +
                            • cat_reorder_batched_ad_indices_cpu() : fbgemm_gpu
                            • +
                            • cat_reorder_batched_ad_indices_cpu_() : fbgemm_gpu
                            • +
                            • compute_frequency_sequence() : fbgemm_gpu
                            • +
                            • compute_num_uint64s() : fbgemm_gpu
                            • +
                            • cp_async_fence() : nbit
                            • +
                            • cp_async_wait() : nbit
                            • +
                            • cp_async_wait< 0 >() : nbit
                            • +
                            • cp_async_zfill() : nbit
                            • +
                            • cp_async_zfill_cg() : nbit
                            • +
                            • csr2csc() : internal
                            • +
                            • csr2csc< double >() : internal
                            • +
                            • csr2csc< float >() : internal
                            • +
                            • CUDA_KERNEL_LOOP() : fbgemm_gpu
                            • +
                            • cutlass_get_smem_pointer() : nbit
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_d.html b/namespacemembers_func_d.html new file mode 100644 index 000000000..385d821f3 --- /dev/null +++ b/namespacemembers_func_d.html @@ -0,0 +1,98 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - d -

                            +
                            + + + + diff --git a/namespacemembers_func_e.html b/namespacemembers_func_e.html new file mode 100644 index 000000000..51cf8d20a --- /dev/null +++ b/namespacemembers_func_e.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - e -

                            +
                            + + + + diff --git a/namespacemembers_func_f.html b/namespacemembers_func_f.html new file mode 100644 index 000000000..c5b57cbd9 --- /dev/null +++ b/namespacemembers_func_f.html @@ -0,0 +1,116 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - f -

                            +
                            + + + + diff --git a/namespacemembers_func_g.html b/namespacemembers_func_g.html new file mode 100644 index 000000000..7042da53b --- /dev/null +++ b/namespacemembers_func_g.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - g -

                              +
                            • generic_histogram_binning_calibration_by_feature_cpu() : fbgemm_gpu
                            • +
                            • generic_histogram_binning_calibration_by_feature_cuda() : fbgemm_gpu
                            • +
                            • get_group_index_select_cols_per_warp() : fbgemm_gpu
                            • +
                            • get_nvlink_matrix() : fbgemm_gpu
                            • +
                            • getScalarType() : fbgemm_gpu
                            • +
                            • getSparseType() : fbgemm_gpu
                            • +
                            • group_index_select_dim0_gpu() : fbgemm_gpu
                            • +
                            • group_index_select_dim0_gpu_backward_meta() : fbgemm_gpu
                            • +
                            • group_index_select_dim0_gpu_impl() : fbgemm_gpu
                            • +
                            • group_index_select_dim0_gpu_impl_meta() : fbgemm_gpu
                            • +
                            • group_index_select_dim0_unpack() : fbgemm_gpu
                            • +
                            • group_index_select_or_add_cuda() : fbgemm_gpu
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_h.html b/namespacemembers_func_h.html new file mode 100644 index 000000000..cea2ce692 --- /dev/null +++ b/namespacemembers_func_h.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - h -

                            +
                            + + + + diff --git a/namespacemembers_func_i.html b/namespacemembers_func_i.html new file mode 100644 index 000000000..1f5f8b730 --- /dev/null +++ b/namespacemembers_func_i.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - i -

                            +
                            + + + + diff --git a/namespacemembers_func_j.html b/namespacemembers_func_j.html new file mode 100644 index 000000000..1432488f0 --- /dev/null +++ b/namespacemembers_func_j.html @@ -0,0 +1,151 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - j -

                              +
                            • jagged_1d_to_dense() : fbgemm_gpu
                            • +
                            • jagged_1d_to_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_gpu_backward() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_gpu_forward() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_kernel() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_matches_opt() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_opt_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output_cuda() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_jagged_output_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_jagged_output_opt_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_backward() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_meta() : fbgemm_gpu
                            • +
                            • jagged_hash_size_cumsum_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_v2_impl() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_kernel() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_v2_impl() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_kernel() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_kernel() : fbgemm_gpu
                            • +
                            • jagged_jagged_elementwise_dense_output_() : fbgemm_gpu
                            • +
                            • jagged_slice() : fbgemm_gpu
                            • +
                            • jagged_slice_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_slice_forward_cpu_kernel() : fbgemm_gpu
                            • +
                            • jagged_softmax() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_cuda() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_kernel() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_softmax_kernel() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_backward() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_forward() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_unique_indices_cuda() : fbgemm_gpu
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_k.html b/namespacemembers_func_k.html new file mode 100644 index 000000000..3ca157ba8 --- /dev/null +++ b/namespacemembers_func_k.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - k -

                              +
                            • keyed_jagged_index_add_dim1_kernel() : fbgemm_gpu
                            • +
                            • keyed_jagged_index_select_dim1_kernel() : fbgemm_gpu
                            • +
                            • keyed_jagged_index_select_dim_1_gpu() : fbgemm_gpu
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_l.html b/namespacemembers_func_l.html new file mode 100644 index 000000000..b98fe6047 --- /dev/null +++ b/namespacemembers_func_l.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - l -

                            +
                            + + + + diff --git a/namespacemembers_func_m.html b/namespacemembers_func_m.html new file mode 100644 index 000000000..4f902ab1a --- /dev/null +++ b/namespacemembers_func_m.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - m -

                            +
                            + + + + diff --git a/namespacemembers_func_n.html b/namespacemembers_func_n.html new file mode 100644 index 000000000..c1f8923ba --- /dev/null +++ b/namespacemembers_func_n.html @@ -0,0 +1,92 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - n -

                            +
                            + + + + diff --git a/namespacemembers_func_o.html b/namespacemembers_func_o.html new file mode 100644 index 000000000..6320f49aa --- /dev/null +++ b/namespacemembers_func_o.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - o -

                            +
                            + + + + diff --git a/namespacemembers_func_p.html b/namespacemembers_func_p.html new file mode 100644 index 000000000..27ba487e6 --- /dev/null +++ b/namespacemembers_func_p.html @@ -0,0 +1,131 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - p -

                              +
                            • pack_segments_autograd() : fbgemm_gpu
                            • +
                            • pack_segments_backward_cpu() : fbgemm_gpu
                            • +
                            • pack_segments_backward_cuda() : fbgemm_gpu
                            • +
                            • pack_segments_cpu() : fbgemm_gpu
                            • +
                            • pack_segments_cuda() : fbgemm_gpu
                            • +
                            • pack_segments_cuda_kernel() : fbgemm_gpu
                            • +
                            • pack_segments_forward_cpu() : fbgemm_gpu
                            • +
                            • pack_segments_forward_cuda() : fbgemm_gpu
                            • +
                            • padded_D() : nbit
                            • +
                            • padded_row_size_in_bytes() : nbit
                            • +
                            • padding_fused_tbe_input_combine_cpu() : fbgemm_gpu
                            • +
                            • padding_fused_tbe_input_combine_with_length_cpu() : fbgemm_gpu
                            • +
                            • permute102_baddbmm_permute102_cpu() : fbgemm_gpu
                            • +
                            • permute102_baddbmm_permute102_cuda() : fbgemm_gpu
                            • +
                            • permute_1D_sparse_data_cpu() : fbgemm_gpu
                            • +
                            • permute_2D_sparse_data_cpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_auto_grad_cpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_auto_grad_gpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_auto_grad_split_cpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_auto_grad_split_gpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_cpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_gpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_split_cpu() : fbgemm_gpu
                            • +
                            • permute_duplicate_pooled_embs_split_gpu() : fbgemm_gpu
                            • +
                            • permute_embeddings_kernel() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad_cpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad_gpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad_meta() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad_split_cpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_auto_grad_split_gpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_cpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_cpu_impl() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_gpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_gpu_impl() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_meta() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_split_cpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_split_cpu_impl() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_split_gpu() : fbgemm_gpu
                            • +
                            • permute_pooled_embs_split_gpu_impl() : fbgemm_gpu
                            • +
                            • permute_sequence_embeddings_cpu() : fbgemm_gpu
                            • +
                            • permute_sequence_embeddings_cuda() : fbgemm_gpu
                            • +
                            • permute_sparse_features_cpu() : fbgemm_gpu
                            • +
                            • prefix_sum() : fbgemm_gpu
                            • +
                            • pruned_array_lookup_from_row_idx_cpu() : fbgemm_gpu
                            • +
                            • pruned_array_lookup_from_row_idx_cuda() : fbgemm_gpu
                            • +
                            • pruned_hash_function() : nbit
                            • +
                            +
                            + + + + diff --git a/namespacemembers_func_q.html b/namespacemembers_func_q.html new file mode 100644 index 000000000..9aa90954f --- /dev/null +++ b/namespacemembers_func_q.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - q -

                            +
                            + + + + diff --git a/namespacemembers_func_r.html b/namespacemembers_func_r.html new file mode 100644 index 000000000..b27efdfd6 --- /dev/null +++ b/namespacemembers_func_r.html @@ -0,0 +1,101 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - r -

                            +
                            + + + + diff --git a/namespacemembers_func_s.html b/namespacemembers_func_s.html new file mode 100644 index 000000000..caad1e57b --- /dev/null +++ b/namespacemembers_func_s.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - s -

                            +
                            + + + + diff --git a/namespacemembers_func_t.html b/namespacemembers_func_t.html new file mode 100644 index 000000000..614096d85 --- /dev/null +++ b/namespacemembers_func_t.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - t -

                            +
                            + + + + diff --git a/namespacemembers_func_u.html b/namespacemembers_func_u.html new file mode 100644 index 000000000..f1054d0c9 --- /dev/null +++ b/namespacemembers_func_u.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - u -

                            +
                            + + + + diff --git a/namespacemembers_func_v.html b/namespacemembers_func_v.html new file mode 100644 index 000000000..4b2c42a75 --- /dev/null +++ b/namespacemembers_func_v.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - v -

                            +
                            + + + + diff --git a/namespacemembers_func_w.html b/namespacemembers_func_w.html new file mode 100644 index 000000000..27330bccc --- /dev/null +++ b/namespacemembers_func_w.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - w -

                            +
                            + + + + diff --git a/namespacemembers_func_z.html b/namespacemembers_func_z.html new file mode 100644 index 000000000..b6eb0c8d1 --- /dev/null +++ b/namespacemembers_func_z.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace functions with links to the namespace documentation for each function:
                            + +

                            - z -

                            +
                            + + + + diff --git a/namespacemembers_g.html b/namespacemembers_g.html new file mode 100644 index 000000000..112f72911 --- /dev/null +++ b/namespacemembers_g.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - g -

                            +
                            + + + + diff --git a/namespacemembers_h.html b/namespacemembers_h.html new file mode 100644 index 000000000..35020138b --- /dev/null +++ b/namespacemembers_h.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - h -

                            +
                            + + + + diff --git a/namespacemembers_i.html b/namespacemembers_i.html new file mode 100644 index 000000000..29a82d4d9 --- /dev/null +++ b/namespacemembers_i.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - i -

                            +
                            + + + + diff --git a/namespacemembers_j.html b/namespacemembers_j.html new file mode 100644 index 000000000..0589f30cf --- /dev/null +++ b/namespacemembers_j.html @@ -0,0 +1,151 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - j -

                              +
                            • jagged_1d_to_dense() : fbgemm_gpu
                            • +
                            • jagged_1d_to_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_gpu_backward() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_gpu_forward() : fbgemm_gpu
                            • +
                            • jagged_2d_to_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_bmm_kernel() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_add_jagged_output_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_matches_opt() : fbgemm_gpu
                            • +
                            • jagged_dense_dense_elementwise_jagged_output_opt_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output_cuda() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_jagged_output_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_add_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_jagged_output_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_jagged_output_opt_() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_backward() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_forward() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_dense_elementwise_mul_meta() : fbgemm_gpu
                            • +
                            • jagged_hash_size_cumsum_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_forward_v2_impl() : fbgemm_gpu
                            • +
                            • jagged_index_add_2d_kernel() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_forward_v2_impl() : fbgemm_gpu
                            • +
                            • jagged_index_select_2d_kernel() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_jagged_bmm_kernel() : fbgemm_gpu
                            • +
                            • jagged_jagged_elementwise_dense_output_() : fbgemm_gpu
                            • +
                            • jagged_slice() : fbgemm_gpu
                            • +
                            • jagged_slice_forward_cpu() : fbgemm_gpu
                            • +
                            • jagged_slice_forward_cpu_kernel() : fbgemm_gpu
                            • +
                            • jagged_softmax() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_cuda() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_kernel() : fbgemm_gpu
                            • +
                            • jagged_softmax_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward_cuda() : fbgemm_gpu
                            • +
                            • jagged_softmax_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_softmax_kernel() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_backward() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_backward_meta() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_forward() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_forward_meta() : fbgemm_gpu
                            • +
                            • jagged_to_padded_dense_meta() : fbgemm_gpu
                            • +
                            • jagged_unique_indices_cuda() : fbgemm_gpu
                            • +
                            +
                            + + + + diff --git a/namespacemembers_k.html b/namespacemembers_k.html new file mode 100644 index 000000000..c6da64df3 --- /dev/null +++ b/namespacemembers_k.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - k -

                              +
                            • kCacheLocationMissing : fbgemm_gpu
                            • +
                            • keyed_jagged_index_add_dim1_kernel() : fbgemm_gpu
                            • +
                            • keyed_jagged_index_select_dim1_kernel() : fbgemm_gpu
                            • +
                            • keyed_jagged_index_select_dim_1_gpu() : fbgemm_gpu
                            • +
                            • kRowInitBufferSize : ssd
                            • +
                            +
                            + + + + diff --git a/namespacemembers_l.html b/namespacemembers_l.html new file mode 100644 index 000000000..c0096ef79 --- /dev/null +++ b/namespacemembers_l.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - l -

                            +
                            + + + + diff --git a/namespacemembers_m.html b/namespacemembers_m.html new file mode 100644 index 000000000..a8a1d65af --- /dev/null +++ b/namespacemembers_m.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - m -

                            +
                            + + + + diff --git a/namespacemembers_n.html b/namespacemembers_n.html new file mode 100644 index 000000000..348363bdf --- /dev/null +++ b/namespacemembers_n.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - n -

                            +
                            + + + + diff --git a/namespacemembers_o.html b/namespacemembers_o.html new file mode 100644 index 000000000..802adcbd6 --- /dev/null +++ b/namespacemembers_o.html @@ -0,0 +1,94 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - o -

                            +
                            + + + + diff --git a/namespacemembers_p.html b/namespacemembers_p.html new file mode 100644 index 000000000..7ff10f2c9 --- /dev/null +++ b/namespacemembers_p.html @@ -0,0 +1,149 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - p -

                            +
                            + + + + diff --git a/namespacemembers_q.html b/namespacemembers_q.html new file mode 100644 index 000000000..305e8672b --- /dev/null +++ b/namespacemembers_q.html @@ -0,0 +1,85 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - q -

                            +
                            + + + + diff --git a/namespacemembers_r.html b/namespacemembers_r.html new file mode 100644 index 000000000..f4b04aa4e --- /dev/null +++ b/namespacemembers_r.html @@ -0,0 +1,105 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - r -

                            +
                            + + + + diff --git a/namespacemembers_s.html b/namespacemembers_s.html new file mode 100644 index 000000000..a8458d675 --- /dev/null +++ b/namespacemembers_s.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - s -

                            +
                            + + + + diff --git a/namespacemembers_t.html b/namespacemembers_t.html new file mode 100644 index 000000000..592cf5baa --- /dev/null +++ b/namespacemembers_t.html @@ -0,0 +1,109 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - t -

                            +
                            + + + + diff --git a/namespacemembers_type.html b/namespacemembers_type.html new file mode 100644 index 000000000..058500a27 --- /dev/null +++ b/namespacemembers_type.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace typedefs with links to the namespace documentation for each typedef:
                            +
                            + + + + diff --git a/namespacemembers_u.html b/namespacemembers_u.html new file mode 100644 index 000000000..1b1f7e75d --- /dev/null +++ b/namespacemembers_u.html @@ -0,0 +1,98 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - u -

                            +
                            + + + + diff --git a/namespacemembers_v.html b/namespacemembers_v.html new file mode 100644 index 000000000..74c750def --- /dev/null +++ b/namespacemembers_v.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - v -

                            +
                            + + + + diff --git a/namespacemembers_vars.html b/namespacemembers_vars.html new file mode 100644 index 000000000..a3fd7f658 --- /dev/null +++ b/namespacemembers_vars.html @@ -0,0 +1,286 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace variables with links to the namespace documentation for each variable:
                            + +

                            - b -

                            + + +

                            - c -

                            + + +

                            - d -

                            + + +

                            - e -

                            + + +

                            - f -

                            + + +

                            - g -

                            + + +

                            - i -

                            + + +

                            - k -

                              +
                            • kCacheLocationMissing : fbgemm_gpu
                            • +
                            • kRowInitBufferSize : ssd
                            • +
                            + + +

                            - l -

                            + + +

                            - m -

                            + + +

                            - n -

                            + + +

                            - o -

                            + + +

                            - p -

                            + + +

                            - r -

                            + + +

                            - s -

                            + + +

                            - t -

                            + + +

                            - u -

                            + + +

                            - v -

                              +
                            • values_data : fbgemm_gpu
                            • +
                            • vec_copy_with_implicit_type_cast< int64_t, int32_t, VEC_WIDTH > : fbgemm_gpu
                            • +
                            + + +

                            - w -

                            +
                            + + + + diff --git a/namespacemembers_w.html b/namespacemembers_w.html new file mode 100644 index 000000000..b8324c8f4 --- /dev/null +++ b/namespacemembers_w.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - w -

                            +
                            + + + + diff --git a/namespacemembers_z.html b/namespacemembers_z.html new file mode 100644 index 000000000..2d3da11f8 --- /dev/null +++ b/namespacemembers_z.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Namespace Members + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + +
                            + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            Here is a list of all namespace members with links to the namespace documentation for each member:
                            + +

                            - z -

                            +
                            + + + + diff --git a/namespacenbit.html b/namespacenbit.html new file mode 100644 index 000000000..c4e0fcef4 --- /dev/null +++ b/namespacenbit.html @@ -0,0 +1,4620 @@ + + + + + + + +fbgemm_gpu: nbit Namespace Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            + +
                            nbit Namespace Reference
                            +
                            +
                            + + + + + +

                            +Functions

                            template<int N>
                            __device__ __forceinline__ void cp_async_wait ()
                             
                            +

                            Function Documentation

                            + +

                            ◆ __launch_bounds__() [1/3]

                            + +
                            +
                            + + + + + + + +
                            template __launch_bounds__ (4 * kWarpSize)
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [2/3]

                            + +
                            +
                            + + + + + + + +
                            __global__ __launch_bounds__ (kMaxThreads ) const
                            +
                            + +
                            +
                            + +

                            ◆ __launch_bounds__() [3/3]

                            + +
                            +
                            +
                            +template<typename index_t , typename output_t , size_t OutputRowsPerThread, size_t WarpsPerBlock, size_t InputRowsInFlight, size_t MinNum128BRows, size_t MaxNum128BRows, bool DeviceOnly>
                            + + + + + + + +
                            __launch_bounds__ (WarpsPerBlock * kWarpSize) const
                            +
                            + +
                            +
                            + +

                            ◆ cp_async_fence()

                            + +
                            +
                            + + + + + + + +
                            __device__ __forceinline__ void cp_async_fence ()
                            +
                            + +
                            +
                            + +

                            ◆ cp_async_wait()

                            + +
                            +
                            +
                            +template<int N>
                            + + + + + + + +
                            __device__ __forceinline__ void cp_async_wait ()
                            +
                            + +

                            Partial specialization.

                            +

                            Blocks until all but N previous cp.async.commit_group operations have committed.

                            + +
                            +
                            + +

                            ◆ cp_async_wait< 0 >()

                            + +
                            +
                            +
                            +template<>
                            + + + + + + + +
                            __device__ __forceinline__ void cp_async_wait< 0 > ()
                            +
                            + +

                            Blocks until all previous cp.async.commit_group operations have committed.

                            + +
                            +
                            + +

                            ◆ cp_async_zfill()

                            + +
                            +
                            +
                            +template<int SizeInBytes>
                            + + + + + + + + + + + + + + + + +
                            __device__ __forceinline__ void cp_async_zfill (void * smem_ptr,
                            void const * global_ptr,
                            bool pred_guard )
                            +
                            + +

                            Copy with zero fill.

                            + +
                            +
                            + +

                            ◆ cp_async_zfill_cg()

                            + +
                            +
                            +
                            +template<int SizeInBytes>
                            + + + + + + + + + + + + + + + + +
                            __device__ __forceinline__ void cp_async_zfill_cg (void * smem_ptr,
                            void const * global_ptr,
                            bool pred_guard )
                            +
                            + +

                            Partial specialization.

                            + +
                            +
                            + +

                            ◆ cutlass_get_smem_pointer() [1/2]

                            + +
                            +
                            + + + + + +
                            + + + + + + + +
                            __device__ unsigned cutlass_get_smem_pointer (void * ptr)
                            +
                            +inline
                            +
                            + +

                            CUTLASS helper to get SMEM pointer.

                            + +
                            +
                            + +

                            ◆ cutlass_get_smem_pointer() [2/2]

                            + +
                            +
                            + + + + + +
                            + + + + + + + +
                            __device__ unsigned cutlass_get_smem_pointer (void const * ptr)
                            +
                            +inline
                            +
                            + +

                            CUTLASS helper to get SMEM pointer.

                            + +
                            +
                            + +

                            ◆ div_round_up()

                            + +
                            +
                            + + + + + + + + + + + +
                            C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t div_round_up (uint32_t a,
                            uint32_t b )
                            +
                            + +
                            +
                            + +

                            ◆ false() [1/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [2/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [3/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [4/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [5/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [6/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [7/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [8/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [9/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [10/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [11/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [12/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [13/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [14/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [15/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [16/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [17/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [18/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [19/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [20/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [21/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [22/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [23/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ false() [24/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template false (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ padded_D()

                            + +
                            +
                            + + + + + +
                            + + + + + + + + + + + +
                            __device__ int32_t padded_D (const int32_t dim,
                            const fbgemm_gpu::SparseType weight_ty )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ padded_row_size_in_bytes()

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            C10_HOST_DEVICE C10_ALWAYS_INLINE int32_t padded_row_size_in_bytes (int32_t dim,
                            fbgemm_gpu::SparseType weight_ty,
                            int32_t row_alignment )
                            +
                            + +
                            +
                            + +

                            ◆ pruned_hash_function()

                            + +
                            +
                            + + + + + +
                            + + + + + + + +
                            __device__ uint32_t pruned_hash_function (uint32_t h)
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ round_up()

                            + +
                            +
                            + + + + + + + + + + + +
                            C10_HOST_DEVICE C10_ALWAYS_INLINE uint32_t round_up (uint32_t a,
                            uint32_t b )
                            +
                            + +
                            +
                            + +

                            ◆ true() [1/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [2/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [3/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [4/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [5/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [6/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [7/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [8/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const int64_t D,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [9/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [10/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [11/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [12/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [13/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [14/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [15/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [16/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [17/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [18/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            const int exponent_bits,
                            const int exponent_bias,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [19/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [20/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [21/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [22/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [23/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ true() [24/24]

                            + +
                            +
                            + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                            template true (const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights,
                            const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements,
                            const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets,
                            const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets,
                            FixedDivisor fd_B,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets,
                            const int64_t pooling_mode,
                            const int64_t row_alignment,
                            pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output,
                            const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights,
                            const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations )
                            +
                            + +
                            +
                            + +

                            ◆ unpadded_row_size_in_bytes()

                            + +
                            +
                            + + + + + + + + + + + +
                            C10_HOST_DEVICE C10_ALWAYS_INLINE int32_t unpadded_row_size_in_bytes (int32_t dim,
                            fbgemm_gpu::SparseType weight_ty )
                            +
                            + +
                            +
                            +

                            Variable Documentation

                            + +

                            ◆ float

                            + +
                            +
                            + + + + +
                            template float
                            +
                            + +
                            +
                            + +

                            ◆ uint8_t

                            + +
                            +
                            + + + + +
                            template uint8_t
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/namespacessd.html b/namespacessd.html new file mode 100644 index 000000000..8b85ca350 --- /dev/null +++ b/namespacessd.html @@ -0,0 +1,167 @@ + + + + + + + +fbgemm_gpu: ssd Namespace Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + +
                            +
                            + +
                            ssd Namespace Reference
                            +
                            +
                            + + + + + + +

                            +Classes

                            class  EmbeddingRocksDB
                             
                            class  Initializer
                             
                            +

                            Function Documentation

                            + +

                            ◆ db_shard()

                            + +
                            +
                            + + + + + +
                            + + + + + + + + + + + +
                            size_t db_shard (int64_t id,
                            size_t num_shards )
                            +
                            +inline
                            +
                            + +
                            +
                            + +

                            ◆ hostAsynchronousThreadPoolExecutor()

                            + +
                            +
                            + + + + + + + + + + + +
                            void hostAsynchronousThreadPoolExecutor (void(*)(void *) f,
                            void * userData )
                            +
                            + +
                            +
                            +

                            Variable Documentation

                            + +

                            ◆ kRowInitBufferSize

                            + +
                            +
                            + + + + + +
                            + + + + +
                            constexpr size_t kRowInitBufferSize = 32 * 1024
                            +
                            +constexpr
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/objects.inv b/objects.inv index 4f9bb70d7..c6e6b5130 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/ops__utils_8h.html b/ops__utils_8h.html new file mode 100644 index 000000000..bce03bd1e --- /dev/null +++ b/ops__utils_8h.html @@ -0,0 +1,137 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/ops_utils.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            +
                            ops_utils.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <torch/library.h>
                            +

                            Macro Definition Documentation

                            + +

                            ◆ DLL_PUBLIC

                            + +
                            +
                            + + + + +
                            #define DLL_PUBLIC   __attribute__((visibility("default")))
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            #define FBGEMM_OP_DISPATCH( DISPATCH_KEY,
                            EXPORT_NAME,
                            FUNC_NAME )
                            +
                            +Value:
                            TORCH_LIBRARY_IMPL(fbgemm, DISPATCH_KEY, m) { \
                            +
                            m.impl( \
                            +
                            EXPORT_NAME, \
                            +
                            torch::dispatch(c10::DispatchKey::DISPATCH_KEY, TORCH_FN(FUNC_NAME))); \
                            +
                            }
                            +
                            TORCH_LIBRARY_IMPL(fbgemm, Autograd, m)
                            Definition jagged_tensor_ops_autograd.cpp:849
                            +
                            +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__function_8cpp.html b/permute__pooled__embedding__function_8cpp.html new file mode 100644 index 000000000..ba6ebc1ad --- /dev/null +++ b/permute__pooled__embedding__function_8cpp.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_function.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_function.cpp File Reference
                            +
                            +
                            + + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops_8cu.html b/permute__pooled__embedding__ops_8cu.html new file mode 100644 index 000000000..7bbf4d5b5 --- /dev/null +++ b/permute__pooled__embedding__ops_8cu.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops.cu File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops.cu File Reference
                            +
                            +
                            +
                            #include <ATen/AccumulateType.h>
                            +#include <ATen/cuda/CUDAContext.h>
                            +#include <ATen/cuda/Exceptions.h>
                            +#include <c10/cuda/CUDAGuard.h>
                            +#include <cuda.h>
                            +#include <cuda_runtime.h>
                            +#include "fbgemm_gpu/ops_utils.h"
                            +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                            +#include "fbgemm_gpu/layout_transform_ops.cuh"
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops_8h.html b/permute__pooled__embedding__ops_8h.html new file mode 100644 index 000000000..8cd156f2d --- /dev/null +++ b/permute__pooled__embedding__ops_8h.html @@ -0,0 +1,106 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <torch/csrc/api/include/torch/types.h>
                            +#include <torch/csrc/autograd/custom_function.h>
                            +#include "fbgemm_gpu/ops_utils.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +
                            + + + +

                            +Classes

                            class  PermutePooledEmbsFunction
                             
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__cpu_8cpp.html b/permute__pooled__embedding__ops__cpu_8cpp.html new file mode 100644 index 000000000..ccf4e85ab --- /dev/null +++ b/permute__pooled__embedding__ops__cpu_8cpp.html @@ -0,0 +1,318 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_cpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_cpu.cpp File Reference
                            +
                            +
                            +
                            #include <c10/util/irange.h>
                            +#include <vector>
                            +#include "fbgemm_gpu/dispatch_macros.h"
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ FBGEMM_OP_DISPATCH() [1/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (Autograd ,
                            "permute_pooled_embs_auto_grad" ,
                            fbgemm_gpu::permute_pooled_embs_auto_grad  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [2/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (CPU ,
                            "permute_duplicate_pooled_embs" ,
                            fbgemm_gpu::permute_duplicate_pooled_embs_cpu  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [3/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (CPU ,
                            "permute_duplicate_pooled_embs_auto_grad" ,
                            fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [4/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (CPU ,
                            "permute_pooled_embs" ,
                            fbgemm_gpu::permute_pooled_embs_cpu  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [5/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (CPU ,
                            "permute_pooled_embs_auto_grad" ,
                            fbgemm_gpu::permute_pooled_embs_auto_grad_cpu  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [6/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (Meta ,
                            "permute_pooled_embs" ,
                            fbgemm_gpu::permute_pooled_embs_meta  )
                            +
                            + +
                            +
                            + +

                            ◆ FBGEMM_OP_DISPATCH() [7/7]

                            + +
                            +
                            + + + + + + + + + + + + + + + + +
                            FBGEMM_OP_DISPATCH (Meta ,
                            "permute_pooled_embs_auto_grad" ,
                            fbgemm_gpu::permute_pooled_embs_auto_grad_meta  )
                            +
                            + +
                            +
                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__gpu_8cpp.html b/permute__pooled__embedding__ops__gpu_8cpp.html new file mode 100644 index 000000000..93bbb3c17 --- /dev/null +++ b/permute__pooled__embedding__ops__gpu_8cpp.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_gpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_gpu.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <c10/util/irange.h>
                            +#include <torch/script.h>
                            +#include <vector>
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__split_8cu.html b/permute__pooled__embedding__ops__split_8cu.html new file mode 100644 index 000000000..c35a90243 --- /dev/null +++ b/permute__pooled__embedding__ops__split_8cu.html @@ -0,0 +1,120 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split.cu File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_split.cu File Reference
                            +
                            +
                            +
                            #include <ATen/AccumulateType.h>
                            +#include <ATen/cuda/CUDAContext.h>
                            +#include <ATen/cuda/Exceptions.h>
                            +#include <c10/cuda/CUDAGuard.h>
                            +#include <cuda.h>
                            +#include <cuda_runtime.h>
                            +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                            +#include "fbgemm_gpu/layout_transform_ops.cuh"
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops_split.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__split_8h.html b/permute__pooled__embedding__ops__split_8h.html new file mode 100644 index 000000000..b35af7ad4 --- /dev/null +++ b/permute__pooled__embedding__ops__split_8h.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_split.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__split__cpu_8cpp.html b/permute__pooled__embedding__ops__split__cpu_8cpp.html new file mode 100644 index 000000000..332a0de12 --- /dev/null +++ b/permute__pooled__embedding__ops__split__cpu_8cpp.html @@ -0,0 +1,140 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_cpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_split_cpu.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <c10/util/irange.h>
                            +#include <torch/script.h>
                            +#include <vector>
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops_split.h"
                            +#include "fbgemm_gpu/permute_pooled_embs_function_split.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embedding__ops__split__gpu_8cpp.html b/permute__pooled__embedding__ops__split__gpu_8cpp.html new file mode 100644 index 000000000..350a79432 --- /dev/null +++ b/permute__pooled__embedding__ops__split__gpu_8cpp.html @@ -0,0 +1,140 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/permute_pooled_embedding_ops/permute_pooled_embedding_ops_split_gpu.cpp File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embedding_ops_split_gpu.cpp File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <ATen/core/op_registration/op_registration.h>
                            +#include <c10/util/irange.h>
                            +#include <torch/script.h>
                            +#include <vector>
                            +#include "fbgemm_gpu/permute_pooled_embedding_ops_split.h"
                            +#include "fbgemm_gpu/permute_pooled_embs_function_split.h"
                            +#include "fbgemm_gpu/sparse_ops_utils.h"
                            +
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +

                            Typedef Documentation

                            + +

                            ◆ Tensor

                            + +
                            +
                            + + + + +
                            using Tensor = at::Tensor
                            +
                            + +
                            +
                            +

                            Function Documentation

                            + +

                            ◆ TORCH_LIBRARY_FRAGMENT()

                            + +
                            +
                            + + + + + + + + + + + +
                            TORCH_LIBRARY_FRAGMENT (fbgemm ,
                            m  )
                            +
                            + +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embs__function_8h.html b/permute__pooled__embs__function_8h.html new file mode 100644 index 000000000..b292ee19c --- /dev/null +++ b/permute__pooled__embs__function_8h.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embs_function.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            +
                            permute_pooled_embs_function.h File Reference
                            +
                            +
                            +
                            + + + + diff --git a/permute__pooled__embs__function__split_8h.html b/permute__pooled__embs__function__split_8h.html new file mode 100644 index 000000000..3dda3ab8d --- /dev/null +++ b/permute__pooled__embs__function__split_8h.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embs_function_split.h File Reference + + + + + + + + + + + +
                            +
                            + + + + + + +
                            +
                            fbgemm_gpu +
                            +
                            +
                            + + + + + + + + +
                            +
                            + + +
                            +
                            +
                            +
                            +
                            +
                            Loading...
                            +
                            Searching...
                            +
                            No Matches
                            +
                            +
                            +
                            +
                            + + +
                            +
                            + +
                            permute_pooled_embs_function_split.h File Reference
                            +
                            +
                            +
                            #include <ATen/ATen.h>
                            +#include <torch/script.h>
                            +
                            + + + +

                            +Classes

                            class  PermutePooledEmbsFunctionSplit< permute_pooled_embs_op >
                             
                            + + + +

                            +Namespaces

                            namespace  fbgemm_gpu
                             
                            +
                            + + + + diff --git a/py-modindex.html b/py-modindex.html index 1ca1756ae..1a04b70da 100644 --- a/py-modindex.html +++ b/py-modindex.html @@ -27,6 +27,8 @@ + + @@ -256,18 +258,19 @@ -

                            FBGEMM_GPU General Info

                            +

                            FBGEMM_GPU General Info

                            -

                            FBGEMM_GPU Python API

                            +

                            FBGEMM_GPU Python API

                            -

                            FBGEMM_GPU C++ API

                            +

                            FBGEMM_GPU C++ API

                            • Sparse Data Operators
                            • Quantization Operators
                            • @@ -415,11 +418,9 @@

                              Python Module Index

                              - - + - - + diff --git a/python-api/jagged_tensor_ops.html b/python-api/jagged_tensor_ops.html index ce6181ac7..701557093 100644 --- a/python-api/jagged_tensor_ops.html +++ b/python-api/jagged_tensor_ops.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -252,18 +254,19 @@ -

                              FBGEMM_GPU General Info

                              +

                              FBGEMM_GPU General Info

                              -

                              FBGEMM_GPU Python API

                              +

                              FBGEMM_GPU Python API

                              -

                              FBGEMM_GPU C++ API

                              +

                              FBGEMM_GPU C++ API

                              • Sparse Data Operators
                              • Quantization Operators
                              • @@ -351,23 +354,23 @@
                                -

                                Jagged Tensor Operators

                                +

                                Jagged Tensor Operators

                                -
                                -torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, max_sequence_length) → Tensor
                                +
                                +torch.ops.fbgemm.jagged_2d_to_dense(values, x_offsets, max_sequence_length) Tensor

                                Converts a jagged tensor, with a 2D values array into a dense tensor, padding with zeros.

                                -
                                Parameters
                                +
                                Parameters:
                                • values (Tensor) – 2D tensor containing the values of the jagged tensor.

                                • x_offsets (Tensor) – 1D tensor containing the starting point of each jagged row in the values tensor.

                                • -
                                • max_sequence_length (int) – Maximum length of any row in the jagged dimension.

                                • +
                                • max_sequence_length (int) – Maximum length of any row in the jagged dimension.

                                -
                                Returns
                                +
                                Returns:

                                The padded dense tensor

                                -
                                Return type
                                +
                                Return type:

                                Tensor

                                @@ -386,22 +389,22 @@
                                -
                                -torch.ops.fbgemm.jagged_1d_to_dense(values, offsets, max_sequence_length, padding_value) -> Tensor)
                                +
                                +torch.ops.fbgemm.jagged_1d_to_dense(values, offsets, max_sequence_length, padding_value) -> Tensor)

                                Converts a jagged tensor, with a 1D values array, into a dense tensor, padding with a specified padding value.

                                -
                                Parameters
                                +
                                Parameters:
                                • values (Tensor) – 1D tensor containing the values of the jagged tensor.

                                • offsets (Tensor) – 1D tensor containing the starting point of each jagged row in the values tensor.

                                • -
                                • max_sequence_length (int) – Maximum length of any row in the jagged dimension.

                                • -
                                • padding_value (int) – Value to set in the empty areas of the dense output, outside of the jagged tensor coverage.

                                • +
                                • max_sequence_length (int) – Maximum length of any row in the jagged dimension.

                                • +
                                • padding_value (int) – Value to set in the empty areas of the dense output, outside of the jagged tensor coverage.

                                -
                                Returns
                                +
                                Returns:

                                the padded dense tensor

                                -
                                Return type
                                +
                                Return type:

                                Tensor

                                @@ -416,21 +419,21 @@
                                -
                                -torch.ops.fbgemm.dense_to_jagged(dense, x_offsets, total_L) -> (Tensor, Tensor[])
                                +
                                +torch.ops.fbgemm.dense_to_jagged(dense, x_offsets, total_L) -> (Tensor, Tensor[])

                                Converts a dense tensor into a jagged tensor, given the desired offsets of the resulting dense tensor.

                                -
                                Parameters
                                +
                                Parameters:
                                • dense (Tensor) – A dense input tensor to be converted

                                • x_offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • -
                                • total_L (int, Optional) – Total number of values in the resulting jagged tensor.

                                • +
                                • total_L (int, Optional) – Total number of values in the resulting jagged tensor.

                                -
                                Returns
                                +
                                Returns:

                                Values and offsets of the resulting jagged tensor. Offsets are identital to those that were input.

                                -
                                Return type
                                +
                                Return type:

                                (Tensor, Tensor[])

                                @@ -446,22 +449,22 @@
                                -
                                -torch.ops.fbgemm.jagged_to_padded_dense(values, offsets, max_lengths, padding_value=0) → Tensor
                                +
                                +torch.ops.fbgemm.jagged_to_padded_dense(values, offsets, max_lengths, padding_value=0) Tensor

                                Converts a jagged tensor into a dense tensor, padding with a specified padding value.

                                -
                                Parameters
                                +
                                Parameters:
                                • values (Tensor) – Jagged tensor values

                                • offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • -
                                • max_lengths (int[]) – A list with max_length for each jagged dimension.

                                • -
                                • padding_value (float) – Value to set in the empty areas of the dense output, outside of the jagged tensor coverage.

                                • +
                                • max_lengths (int[]) – A list with max_length for each jagged dimension.

                                • +
                                • padding_value (float) – Value to set in the empty areas of the dense output, outside of the jagged tensor coverage.

                                -
                                Returns
                                +
                                Returns:

                                the padded dense tensor

                                -
                                Return type
                                +
                                Return type:

                                Tensor

                                @@ -480,54 +483,54 @@
                                -
                                -torch.ops.fbgemm.jagged_dense_elementwise_add(x_values, x_offsets, y) → Tensor
                                +
                                +torch.ops.fbgemm.jagged_dense_elementwise_add(x_values, x_offsets, y) Tensor

                                Adds a jagged tensor to a dense tensor, resulting in dense tensor. Jagged tensor input will be padded with zeros for the purposes of the addition.

                                -
                                Parameters
                                +
                                Parameters:
                                • x_values (Tensor) – Jagged tensor values

                                • offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • y (Tensor) – A dense tensor

                                -
                                Returns
                                +
                                Returns:

                                The sum of jagged input tensor + y

                                -
                                Return type
                                +
                                Return type:

                                Tensor

                                -
                                -torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output(x_values, x_offsets, y) -> (Tensor, Tensor[])
                                +
                                +torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output(x_values, x_offsets, y) -> (Tensor, Tensor[])

                                Adds a jagged tensor to a dense tensor and, resulting in a jagged tensor with the same structure as the input jagged tensor.

                                -
                                Parameters
                                +
                                Parameters:
                                • x_values (Tensor) – Jagged tensor values

                                • x_offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • y (Tensor) – A dense tensor

                                -
                                Returns
                                +
                                Returns:

                                Values and offsets of the resulting jagged tensor. Offsets are identital to those that were input.

                                -
                                Return type
                                +
                                Return type:

                                (Tensor, Tensor[])

                                -
                                -torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output(x_values, x_offsets, y_0, y_1) -> (Tensor, Tensor[])
                                +
                                +torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output(x_values, x_offsets, y_0, y_1) -> (Tensor, Tensor[])

                                Adds a jagged tensor to the sum of two dense tensors, resulting in a jagged tensor with the same structure as the input jagged tensor.

                                -
                                Parameters
                                +
                                Parameters:
                                • x_values (Tensor) – Jagged tensor values

                                • x_offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • @@ -535,67 +538,67 @@
                                • y_1 (Tensor) – A dense tensor

                                -
                                Returns
                                +
                                Returns:

                                Values and offsets of the resulting jagged tensor. Offsets are identital to those that were input.

                                -
                                Return type
                                +
                                Return type:

                                (Tensor, Tensor[])

                                -
                                -torch.ops.fbgemm.jagged_dense_elementwise_mul(x_values, x_offsets, y) -> (Tensor, Tensor[])
                                +
                                +torch.ops.fbgemm.jagged_dense_elementwise_mul(x_values, x_offsets, y) -> (Tensor, Tensor[])

                                Elementwise-multiplies a jagged tensor a dense tensor and, resulting in a jagged tensor with the same structure as the input jagged tensor.

                                -
                                Parameters
                                +
                                Parameters:
                                • x_values (Tensor) – Jagged tensor values

                                • x_offsets (Tensor[]) – A list of jagged offset tensors, one for each jagged dimension.

                                • y (Tensor) – A dense tensor

                                -
                                Returns
                                +
                                Returns:

                                Values and offsets of the resulting jagged tensor. Offsets are identital to those that were input.

                                -
                                Return type
                                +
                                Return type:

                                (Tensor, Tensor[])

                                -
                                -torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul(Tensor v, Tensor a_values, Tensor a_offsets) → Tensor
                                +
                                +torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul(Tensor v, Tensor a_values, Tensor a_offsets) Tensor

                                Batched vector matrix multiplication of a batched dense vector with a jagged tensor, dense vector is in size (B * H, max_N) and jagged tensor is in size (B, max_N, H * D) where max_N is the maximum size of jagged dimension. B * H is the batch size and each multiplies is max_N with [max_N, D]

                                -
                                Parameters
                                +
                                Parameters:
                                • v (Tensor) – dense vector tensor

                                • a_values (Tensor) – Jagged tensor values

                                • a_offsets (Tensor []) – A list of jagged offset tensors, one for each jagged dimension.

                                -
                                Returns
                                +
                                Returns:

                                output of batch matmul in size (B * H, D)

                                -
                                Return type
                                +
                                Return type:

                                Tensor

                                -
                                -torch.ops.fbgemm.stacked_jagged_1d_to_dense(*args, **kwargs)
                                +
                                +torch.ops.fbgemm.stacked_jagged_1d_to_dense(*args, **kwargs)
                                -
                                -torch.ops.fbgemm.stacked_jagged_2d_to_dense(*args, **kwargs)
                                +
                                +torch.ops.fbgemm.stacked_jagged_2d_to_dense(*args, **kwargs)
                                @@ -643,7 +646,20 @@
                                @@ -659,11 +675,9 @@ - - + - - + diff --git a/python-api/table_batched_embedding_ops.html b/python-api/table_batched_embedding_ops.html index c3f02b7c2..1bf9430c9 100644 --- a/python-api/table_batched_embedding_ops.html +++ b/python-api/table_batched_embedding_ops.html @@ -6,7 +6,7 @@ - + @@ -28,10 +28,12 @@ + + - + - - + - - + diff --git a/pytorch-sphinx-theme/CODE_OF_CONDUCT.html b/pytorch-sphinx-theme/CODE_OF_CONDUCT.html deleted file mode 100644 index f5a4c2aa1..000000000 --- a/pytorch-sphinx-theme/CODE_OF_CONDUCT.html +++ /dev/null @@ -1,757 +0,0 @@ - - - - - - - - - - - - - Code of Conduct — fbgemm 0.1.2 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                                -
                                -
                                - - - - - - - - - - - - - - - - -
                                - - - - -
                                -
                                - -
                                - Shortcuts -
                                -
                                - -
                                -
                                - - - - - - -
                                - -
                                -
                                - -
                                -

                                Code of Conduct

                                -
                                -

                                Our Pledge

                                -

                                In the interest of fostering an open and welcoming environment, we as -contributors and maintainers pledge to make participation in our project and -our community a harassment-free experience for everyone, regardless of age, body -size, disability, ethnicity, sex characteristics, gender identity and expression, -level of experience, education, socio-economic status, nationality, personal -appearance, race, religion, or sexual identity and orientation.

                                -
                                -
                                -

                                Our Standards

                                -

                                Examples of behavior that contributes to creating a positive environment -include:

                                -
                                  -
                                • Using welcoming and inclusive language

                                • -
                                • Being respectful of differing viewpoints and experiences

                                • -
                                • Gracefully accepting constructive criticism

                                • -
                                • Focusing on what is best for the community

                                • -
                                • Showing empathy towards other community members

                                • -
                                -

                                Examples of unacceptable behavior by participants include:

                                -
                                  -
                                • The use of sexualized language or imagery and unwelcome sexual attention or -advances

                                • -
                                • Trolling, insulting/derogatory comments, and personal or political attacks

                                • -
                                • Public or private harassment

                                • -
                                • Publishing others’ private information, such as a physical or electronic -address, without explicit permission

                                • -
                                • Other conduct which could reasonably be considered inappropriate in a -professional setting

                                • -
                                -
                                -
                                -

                                Our Responsibilities

                                -

                                Project maintainers are responsible for clarifying the standards of acceptable -behavior and are expected to take appropriate and fair corrective action in -response to any instances of unacceptable behavior.

                                -

                                Project maintainers have the right and responsibility to remove, edit, or -reject comments, commits, code, wiki edits, issues, and other contributions -that are not aligned to this Code of Conduct, or to ban temporarily or -permanently any contributor for other behaviors that they deem inappropriate, -threatening, offensive, or harmful.

                                -
                                -
                                -

                                Scope

                                -

                                This Code of Conduct applies within all project spaces, and it also applies when -an individual is representing the project or its community in public spaces. -Examples of representing a project or community include using an official -project e-mail address, posting via an official social media account, or acting -as an appointed representative at an online or offline event. Representation of -a project may be further defined and clarified by project maintainers.

                                -
                                -
                                -

                                Enforcement

                                -

                                Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at opensource-conduct@fb.com. All -complaints will be reviewed and investigated and will result in a response that -is deemed necessary and appropriate to the circumstances. The project team is -obligated to maintain confidentiality with regard to the reporter of an incident. -Further details of specific enforcement policies may be posted separately.

                                -

                                Project maintainers who do not follow or enforce the Code of Conduct in good -faith may face temporary or permanent repercussions as determined by other -members of the project’s leadership.

                                -
                                -
                                -

                                Attribution

                                -

                                This Code of Conduct is adapted from the Contributor Covenant, version 1.4, -available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html

                                -

                                For answers to common questions about this code of conduct, see -https://www.contributor-covenant.org/faq

                                -
                                -
                                - - -
                                - -
                                -
                                - - - - -
                                - - - -
                                -

                                - © Copyright 2023, FBGEMM Team. - -

                                -
                                - -
                                - Built with Sphinx using a theme provided by Read the Docs. -
                                - - -
                                - -
                                -
                                - - -
                                -
                                - - - - - - - - - - - - - - - - - - - - - - - - - - -
                                -
                                -
                                -
                                -

                                Docs

                                -

                                Access comprehensive developer documentation for PyTorch

                                - View Docs -
                                - -
                                -

                                Tutorials

                                -

                                Get in-depth tutorials for beginners and advanced developers

                                - View Tutorials -
                                - -
                                -

                                Resources

                                -

                                Find development resources and get your questions answered

                                - View Resources -
                                -
                                -
                                -
                                - - - - - - - - - -
                                -
                                -
                                -
                                - - -
                                -
                                -
                                - - -
                                - - - - - - - - \ No newline at end of file diff --git a/pytorch-sphinx-theme/CONTRIBUTING.html b/pytorch-sphinx-theme/CONTRIBUTING.html deleted file mode 100644 index 605db1d24..000000000 --- a/pytorch-sphinx-theme/CONTRIBUTING.html +++ /dev/null @@ -1,717 +0,0 @@ - - - - - - - - - - - - - Contributing to pytorch_sphinx_theme — fbgemm 0.1.2 documentation - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
                                -
                                -
                                - - - - - - - - - - - - - - - - -
                                - -
                                  - -
                                • - - - Docs - - > -
                                • - - -
                                • Contributing to pytorch_sphinx_theme
                                • - - -
                                • - - - - - -
                                • - -
                                - - -
                                -
                                - -
                                - Shortcuts -
                                -
                                - -
                                -
                                - - - - - - -
                                - -
                                -
                                - -
                                -

                                Contributing to pytorch_sphinx_theme

                                -

                                We want to make contributing to this project as easy and transparent as -possible.

                                -
                                -

                                Pull Requests

                                -

                                We actively welcome your pull requests.

                                -
                                  -
                                1. Fork the repo and create your branch from master.

                                2. -
                                3. If you’ve added code that should be tested, add tests.

                                4. -
                                5. If you’ve changed APIs, update the documentation.

                                6. -
                                7. Ensure the test suite passes.

                                8. -
                                9. Make sure your code lints.

                                10. -
                                11. If you haven’t already, complete the Contributor License Agreement (“CLA”).

                                12. -
                                -
                                -
                                -

                                Contributor License Agreement (“CLA”)

                                -

                                In order to accept your pull request, we need you to submit a CLA. You only need -to do this once to work on any of Facebook’s open source projects.

                                -

                                Complete your CLA here: https://code.facebook.com/cla

                                -
                                -
                                -

                                Issues

                                -

                                We use GitHub issues to track public bugs. Please ensure your description is -clear and has sufficient instructions to be able to reproduce the issue.

                                -

                                Facebook has a bounty program for the safe -disclosure of security bugs. In those cases, please go through the process -outlined on that page and do not file a public issue.

                                -
                                -
                                -

                                License

                                -

                                By contributing to pytorch_sphinx_theme, you agree that your contributions will be licensed -under the LICENSE file in the root directory of this source tree.

                                -
                                -
                                - - -
                                - -
                                -
                                - - - - -
                                - - - -
                                -

                                - © Copyright 2023, FBGEMM Team. - -

                                -
                                - -
                                - Built with Sphinx using a theme provided by Read the Docs. -
                                - - -
                                - -
                                -
                                - - -
                                -
                                - - - - - - - - - - - - - - - - - - - - - - - - - - -
                                -
                                -
                                -
                                -

                                Docs

                                -

                                Access comprehensive developer documentation for PyTorch

                                - View Docs -
                                - -
                                -

                                Tutorials

                                -

                                Get in-depth tutorials for beginners and advanced developers

                                - View Tutorials -
                                - -
                                -

                                Resources

                                -

                                Find development resources and get your questions answered

                                - View Resources -
                                -
                                -
                                -
                                - - - - - - - - - -
                                -
                                -
                                -
                                - - -
                                -
                                -
                                - - -
                                - - - - - - - - \ No newline at end of file diff --git a/pytorch-sphinx-theme/docs/changelog.html b/pytorch-sphinx-theme/docs/changelog.html index 439ee040a..5fd14c339 100644 --- a/pytorch-sphinx-theme/docs/changelog.html +++ b/pytorch-sphinx-theme/docs/changelog.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                                FBGEMM_GPU General Info

                                +

                                FBGEMM_GPU General Info

                                -

                                FBGEMM_GPU Python API

                                +

                                FBGEMM_GPU Python API

                                -

                                FBGEMM_GPU C++ API

                                +

                                FBGEMM_GPU C++ API

                                +
                                -

                                3.1. Inline Markup

                                +

                                3.1. Inline Markup

                                Paragraphs contain text and may contain inline markup: emphasis, strong emphasis, inline literals, -standalone hyperlinks (http://www.python.org), external hyperlinks (Python 5), internal cross-references (example), +standalone hyperlinks (http://www.python.org), external hyperlinks (Python [5]), internal cross-references (example), external hyperlinks with embedded URIs (Python web site), footnote references -(manually numbered 1, anonymous auto-numbered 3, labeled auto-numbered 2, or symbolic *), -citation references (12), substitution references (EXAMPLE), and inline hyperlink targets +(manually numbered [1], anonymous auto-numbered [3], labeled auto-numbered [2], or symbolic [*]), +citation references ([12]), substitution references (EXAMPLE), and inline hyperlink targets (see Targets below for a reference back to here). Character-level inline markup is also possible (although exceedingly ugly!) in reStructuredText. Problems are indicated by |problematic| text (generated by processing errors; this one is intentional).

                                Also with sphinx.ext.autodoc, which I use in the demo, I can link to test_py_module.test.Foo. It will link you right my code documentation for it.

                                The default role for interpreted text is Title Reference. Here are some explicit interpreted text roles: -a PEP reference (PEP 287); an RFC reference (RFC 2822); a subscript; a superscript; +a PEP reference (PEP 287); an RFC reference (RFC 2822); a subscript; a superscript; and explicit roles for standard inline markup.

                                GUI labels are a useful way to indicate that Some action is to be taken by the user. The GUI label should not run over line-height so as not to interfere with text from adjacent lines.

                                Key-bindings indicate that the read is to press a button on the keyboard or mouse, -for example MMB and Shift-MMB. Another useful markup to indicate a user action +for example MMB and Shift-MMB. Another useful markup to indicate a user action is to use menuselection this can be used to show short and long menus in software. For example, and menuselection can be seen here that breaks is too long to fit on this line. My ‣ Software ‣ Some menu ‣ Some sub menu 1 ‣ sub menu 2.

                                @@ -439,7 +442,7 @@

                                3.1. If the --pep-references option was supplied, there should be a live link to PEP 258 here.

                                -

                                3.2. Math

                                +

                                3.2. Math

                                This is a test. Here is an equation: \(X_{0:5} = (X_0, X_1, X_2, X_3, X_4)\). Here is another:

                                @@ -453,12 +456,12 @@

                                3.2. You can add a link to equations like the one above (1) by using :eq:.

                                -

                                3.3. Meta

                                +

                                3.3. Meta

                                -

                                3.4. Blocks

                                +

                                3.4. Blocks

                                -

                                3.4.1. Literal Blocks

                                +

                                3.4.1. Literal Blocks

                                Literal blocks are indicated with a double-colon (“::”) at the end of the preceding paragraph (over there -->). They can be indented:

                                if literal_block:
                                @@ -475,7 +478,7 @@ 

                                3.4.1.

                                -

                                3.4.2. Line Blocks

                                +

                                3.4.2. Line Blocks

                                This is a line block. It ends with a blank line.
                                @@ -524,7 +527,7 @@

                                3.4.2.

                                -

                                3.4.3. Block Quotes

                                +

                                3.4.3. Block Quotes

                                Block quotes consist of indented body elements:

                                My theory by A. Elk. Brackets Miss, brackets. This theory goes @@ -536,7 +539,7 @@

                                3.4.3.

                                -

                                3.4.4. Doctest Blocks

                                +

                                3.4.4. Doctest Blocks

                                >>> print 'Python-specific usage examples; begun with ">>>"'
                                 Python-specific usage examples; begun with ">>>"
                                 >>> print '(cut and pasted from interactive Python sessions)'
                                @@ -545,7 +548,7 @@ 

                                3.4.4.

                                -

                                3.4.5. Code Blocks

                                +

                                3.4.5. Code Blocks

                                # parsed-literal test
                                 curl -O http://someurl/release-.tar-gz
                                @@ -574,23 +577,19 @@

                                3.4.5.

                                -

                                3.4.5.1. Emphasized lines with line numbers

                                -
                                1
                                -2
                                -3
                                -4
                                -5
                                def some_function():
                                -    interesting = False
                                -    print 'This line is highlighted.'
                                -    print 'This one is not...'
                                -    print '...but this one is.'
                                -
                                +

                                3.4.5.1. Emphasized lines with line numbers

                                +
                                1def some_function():
                                +2    interesting = False
                                +3    print 'This line is highlighted.'
                                +4    print 'This one is not...'
                                +5    print '...but this one is.'
                                +
                                -

                                3.6. References

                                +

                                3.6. References

                                -

                                3.6.1. Footnotes

                                -
                                -
                                1(1,2)
                                -

                                A footnote contains body elements, consistently indented by at +

                                3.6.1. Footnotes

                                +
                                -
                                2(1,2)
                                -

                                Footnotes may be numbered, either manually (as in 1) or + +

                                -
                                3
                                -

                                This footnote is numbered automatically and anonymously using a + +

                                -
                                *
                                -

                                Footnotes may also use symbols, specified with a “*” label. -Here’s a reference to the next footnote: .

                                -
                                -
                                -

                                This footnote shows the next symbol in the sequence.

                                -
                                -
                                4
                                -

                                Here’s an unreferenced footnote, with a reference to a + +

                                + +
                                -
                                + +
                                -

                                3.6.2. Citations

                                -
                                -
                                11
                                -

                                This is the citation I made, let’s make this extremely long so that we can tell that it doesn’t follow the normal responsive table stuff.

                                -
                                -
                                12(1,2)
                                -

                                This citation has some code blocks in it, maybe some bold and -italics too. Heck, lets put a link to a meta citation 13 too.

                                -
                                -
                                13
                                -

                                This citation will have two backlinks.

                                -
                                -
                                -

                                Here’s a reference to the above, 12, and a [nonexistent] citation.

                                +

                                3.6.2. Citations

                                + +

                                Here’s a reference to the above, [12], and a [nonexistent] citation.

                                Here is another type of citation: citation

                                -

                                3.6.3. Glossary

                                +

                                3.6.3. Glossary

                                This is a glossary with definition terms for thing like Writing:

                                -
                                -
                                Documentation

                                Provides users with the knowledge they need to use something.

                                +
                                +
                                Documentation

                                Provides users with the knowledge they need to use something.

                                -
                                Reading

                                The process of taking information into ones mind through the use of eyes.

                                +
                                Reading

                                The process of taking information into ones mind through the use of eyes.

                                -
                                Writing

                                The process of putting thoughts into a medium for other people to read.

                                +
                                Writing

                                The process of putting thoughts into a medium for other people to read.

                                -

                                3.6.4. Targets

                                +

                                3.6.4. Targets

                                This paragraph is pointed to by the explicit “example” target. A reference can be found under Inline Markup, above. Inline hyperlink targets are also possible.

                                Section headers are implicit targets, referred to by name. See Targets, which is a subsection of `Body Elements`_.

                                -

                                Explicit external targets are interpolated into references such as “Python 5”.

                                +

                                Explicit external targets are interpolated into references such as “Python [5]”.

                                Targets may be indirect and anonymous. Thus this phrase may also refer to the Targets section.

                                Here’s a `hyperlink reference without a target`_, which generates an error.

                                -

                                3.7. Directives

                                +

                                3.7. Directives

                                -

                                3.7.1. Contents

                                +

                                3.7.1. Contents

                                These are just a sample of the many reStructuredText Directives. For others, please see: http://docutils.sourceforge.net/docs/ref/rst/directives.html.

                                -

                                3.7.2. Centered text

                                +

                                3.7.2. Centered text

                                You can create a statement with centered text with .. centered::

                                This is centered text!

                                -

                                3.7.3. Images & Figures

                                +

                                3.7.3. Images & Figures

                                -

                                3.7.3.1. Images

                                +

                                3.7.3.1. Images

                                An image directive (also clickable – a hyperlink reference):

                                ../../../_images/yi_jing_01_chien.jpg
                                -

                                3.7.3.2. Figures

                                +

                                3.7.3.2. Figures

                                reStructuredText, the markup syntax

                                A figure is an image with a caption and/or a legend:

                                ---- @@ -832,7 +800,7 @@

                                3.7.3.2.
                                -

                                3.7.4. Admonitions

                                +

                                3.7.4. Admonitions

                                Attention

                                Directives at large.

                                @@ -882,9 +850,6 @@

                                3.7.4. Tip

                                15% if the service is good.

                                re

                                Revised, revisited, based on ‘re’ module.

                                --- @@ -910,7 +875,7 @@

                                3.7.4.
                                -

                                3.7.5. Topics, Sidebars, and Rubrics

                                +

                                3.7.5. Topics, Sidebars, and Rubrics

                                -
                                +
                                +

                                This is a rubric

                                -

                                3.7.6. Target Footnotes

                                -
                                -
                                5(1,2,3)
                                -

                                http://www.python.org/

                                -
                                -
                                +

                                3.7.6. Target Footnotes

                                +
                                -

                                3.7.7. Replacement Text

                                -

                                I recommend you try Python, the best language around 5.

                                +

                                3.7.7. Replacement Text

                                +

                                I recommend you try Python, the best language around [5].

                                -

                                3.7.8. Compound Paragraph

                                +

                                3.7.8. Compound Paragraph

                                -

                                This paragraph contains a literal block:

                                -
                                Connecting... OK
                                +

                                This paragraph contains a literal block:

                                +
                                Connecting... OK
                                 Transmitting data... OK
                                 Disconnecting... OK
                                 
                                -

                                and thus consists of a simple paragraph, a literal block, and +

                                and thus consists of a simple paragraph, a literal block, and another simple paragraph. Nonetheless it is semantically one paragraph.

                                @@ -956,7 +923,7 @@

                                3.7.8.

                                @@ -1054,11 +1021,10 @@

                                3.8. - - + - - + + diff --git a/pytorch-sphinx-theme/docs/demo/lists_tables.html b/pytorch-sphinx-theme/docs/demo/lists_tables.html index 90c8bb82d..e312df374 100644 --- a/pytorch-sphinx-theme/docs/demo/lists_tables.html +++ b/pytorch-sphinx-theme/docs/demo/lists_tables.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                                FBGEMM_GPU General Info

                                +

                                FBGEMM_GPU General Info

                                -

                                FBGEMM_GPU Python API

                                +

                                FBGEMM_GPU Python API

                                -

                                FBGEMM_GPU C++ API

                                +

                                FBGEMM_GPU C++ API

                                - +
                                -

                                4.1. Lists

                                +

                                4.1. Lists

                                -

                                4.1.1. Enumerated Lists

                                +

                                4.1.1. Enumerated Lists

                                1. Arabic numerals.

                                    @@ -428,7 +431,7 @@

                                    4.1.1.

                                -

                                4.1.2. Definition Lists

                                +

                                4.1.2. Definition Lists

                                Term

                                Definition

                                @@ -440,7 +443,7 @@

                                4.1.2.

                                -

                                4.1.3. Option Lists

                                +

                                4.1.3. Option Lists

                                For listing command-line options:

                                -a
                                @@ -478,38 +481,38 @@

                                4.1.3. There must be at least two spaces between the option and the description.

                                -

                                4.1.4. Field list

                                +

                                4.1.4. Field list

                                -
                                Author
                                +
                                Author:

                                David Goodger

                                -
                                Address
                                +
                                Address:

                                123 Example Street Example, EX Canada A1B 2C3

                                -
                                Contact
                                +
                                Contact:

                                docutils-develop@lists.sourceforge.net

                                -
                                Authors
                                +
                                Authors:

                                Me; Myself; I

                                -
                                organization
                                +
                                organization:

                                humankind

                                -
                                date
                                +
                                date:

                                $Date: 2012-01-03 19:23:53 +0000 (Tue, 03 Jan 2012) $

                                -
                                status
                                +
                                status:

                                This is a “work in progress”

                                -
                                revision
                                +
                                revision:

                                $Revision: 7302 $

                                -
                                version
                                +
                                version:

                                1

                                -
                                copyright
                                +
                                copyright:

                                This document has been placed in the public domain. You may do with it as you wish. You may copy, modify, redistribute, reattribute, sell, buy, rent, lease, @@ -518,17 +521,17 @@

                                4.1.4.

                                -
                                field name
                                +
                                field name:

                                This is a generic bibliographic field.

                                -
                                field name 2
                                +
                                field name 2:

                                Generic bibliographic fields may contain multiple body elements.

                                Like this.

                                -
                                Dedication
                                +
                                Dedication:

                                For Docutils users & co-developers.

                                -
                                abstract
                                +
                                abstract:

                                This document is a demonstration of the reStructuredText markup language, containing examples of all basic reStructuredText constructs and many advanced constructs.

                                @@ -536,7 +539,7 @@

                                4.1.4.

                                -

                                4.1.5. Bullet Lists

                                +

                                4.1.5. Bullet Lists

                                Example

                                 1
                                - 2
                                - 3
                                - 4
                                - 5
                                - 6
                                - 7
                                - 8
                                - 9
                                -10
                                # -*- coding: utf-8 -*-
                                -"""Test Module for sphinx_rtd_theme."""
                                -
                                -
                                -class Foo:
                                -
                                -    """Docstring for class Foo.
                                -
                                -    This text tests for the formatting of docstrings generated from output
                                -    ``sphinx.ext.autodoc``. Which contain reST, but sphinx nests it in the
                                -
                                +
                                 1# -*- coding: utf-8 -*-
                                + 2"""Test Module for sphinx_rtd_theme."""
                                + 3
                                + 4
                                + 5class Foo:
                                + 6
                                + 7    """Docstring for class Foo.
                                + 8
                                + 9    This text tests for the formatting of docstrings generated from output
                                +10    ``sphinx.ext.autodoc``. Which contain reST, but sphinx nests it in the
                                +
                              @@ -605,7 +599,7 @@

                              4.1.5.1.

                              and hehe

                            -
                            4.1.5.1.1. But deeper down the rabbit hole
                            +
                            4.1.5.1.1. But deeper down the rabbit hole
                            • I kept saying that, “deeper down the rabbit hole”. yahoo

                      -

                      4.1.6. Hlists

                      +

                      4.1.6. Hlists

                      • First item

                      • Second item

                      • @@ -663,7 +657,7 @@

                        4.1.6.

                      -

                      4.1.7. Numbered List

                      +

                      4.1.7. Numbered List

                      1. One,

                      2. Two.

                      3. @@ -687,17 +681,11 @@

                        4.1.7.

                      -

                      4.2. Tables

                      +

                      4.2. Tables

                      -

                      4.2.1. Grid Tables

                      +

                      4.2.1. Grid Tables

                      Here’s a grid table followed by a simple table:

                      ------ @@ -735,11 +723,6 @@

                      4.2.1.

                      Header row, column 1 (header rows optional)

                      ----- @@ -769,22 +752,8 @@

                      4.2.1.

                      Inputs

                      Output

                      -

                      4.2.1.1. Giant Tables

                      +

                      4.2.1.1. Giant Tables

                      -------------- @@ -858,14 +827,14 @@

                      4.2.1.1.
                      -

                      4.2.2. List Tables

                      -

                      Header 1

                      Header 2

                      +

                      4.2.2. List Tables

                      +
                      ----++++ @@ -894,10 +863,6 @@

                      4.2.2.

                      List tables can have captions like this one.

                      List table

                      ----
                      This is a list table with images in it.
                      ../../../_images/yi_jing_01_chien.jpg @@ -997,11 +962,9 @@

                      4.2.2. - - + - - + diff --git a/pytorch-sphinx-theme/docs/demo/long.html b/pytorch-sphinx-theme/docs/demo/long.html index 6da90caf1..52b3e614d 100644 --- a/pytorch-sphinx-theme/docs/demo/long.html +++ b/pytorch-sphinx-theme/docs/demo/long.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                      FBGEMM_GPU General Info

                      +

                      FBGEMM_GPU General Info

                      -

                      FBGEMM_GPU Python API

                      +

                      FBGEMM_GPU Python API

                      -

                      FBGEMM_GPU C++ API

                      +

                      FBGEMM_GPU C++ API

                      - +

                      This section demonstrates how the ‘sticky_navigation’ setting behaves when the menu is very long. When this section is selected, it will make the menu and the main area scroll when you are at the top of the page.

                      -

                      1.1. Example Menu 1

                      +

                      1.1. Example Menu 1

                      Just a place holder…

                      -

                      1.2. Example Menu 2

                      +

                      1.2. Example Menu 2

                      Just a place holder…

                      -

                      1.3. Example Menu 3

                      +

                      1.3. Example Menu 3

                      Just a place holder…

                      -

                      1.4. Example Menu 4

                      +

                      1.4. Example Menu 4

                      Just a place holder…

                      -

                      1.5. Example Menu 5

                      +

                      1.5. Example Menu 5

                      Just a place holder…

                      -

                      1.6. Example Menu 6

                      +

                      1.6. Example Menu 6

                      Just a place holder…

                      -

                      1.7. Example Menu 7

                      +

                      1.7. Example Menu 7

                      Just a place holder…

                      -

                      1.8. Example Menu 8

                      +

                      1.8. Example Menu 8

                      Just a place holder…

                      -

                      1.9. Example Menu 9

                      +

                      1.9. Example Menu 9

                      Just a place holder…

                      -

                      1.10. Example Menu 10

                      +

                      1.10. Example Menu 10

                      Just a place holder…

                      -

                      1.11. Example Menu 11

                      +

                      1.11. Example Menu 11

                      Just a place holder…

                      -

                      1.12. Example Menu 12

                      +

                      1.12. Example Menu 12

                      Just a place holder…

                      -

                      1.13. Example Menu 13

                      +

                      1.13. Example Menu 13

                      Just a place holder…

                      -

                      1.14. Example Menu 14

                      +

                      1.14. Example Menu 14

                      Just a place holder…

                      -

                      1.15. Example Menu 15

                      +

                      1.15. Example Menu 15

                      Just a place holder…

                      -

                      1.16. Example Menu 16

                      +

                      1.16. Example Menu 16

                      Just a place holder…

                      -

                      1.17. Example Menu 17

                      +

                      1.17. Example Menu 17

                      Just a place holder…

                      -

                      1.18. Example Menu 18

                      +

                      1.18. Example Menu 18

                      Just a place holder…

                      -

                      1.19. Example Menu 19

                      +

                      1.19. Example Menu 19

                      Just a place holder…

                      -

                      1.20. Example Menu 20

                      +

                      1.20. Example Menu 20

                      Just a place holder…

                      -

                      1.21. Example Submenu 1

                      +

                      1.21. Example Submenu 1

                      Just a place holder…

                      -

                      1.22. Example Submenu 2

                      +

                      1.22. Example Submenu 2

                      Just a place holder…

                      -

                      1.22.1. Submenu 1

                      +

                      1.22.1. Submenu 1

                      Just a place holder…

                      -

                      1.22.1.1. Subsubmenu 1

                      +

                      1.22.1.1. Subsubmenu 1

                      Just a place holder…

                      -

                      1.22.2. Submenu 2

                      +

                      1.22.2. Submenu 2

                      Just a place holder…

                      -

                      1.22.2.1. Subsubmenu 1

                      +

                      1.22.2.1. Subsubmenu 1

                      Just a place holder…

                      -

                      1.22.3. Submenu 3

                      +

                      1.22.3. Submenu 3

                      Just a place holder…

                      -

                      1.22.4. Submenu 4

                      +

                      1.22.4. Submenu 4

                      Just a place holder…

                      -

                      1.22.5. Submenu 5

                      +

                      1.22.5. Submenu 5

                      Just a place holder…

                      @@ -667,11 +670,9 @@

                      1.22.5. - - + - - + diff --git a/pytorch-sphinx-theme/docs/demo/structure.html b/pytorch-sphinx-theme/docs/demo/structure.html index b2cbd466d..c6c783f0f 100644 --- a/pytorch-sphinx-theme/docs/demo/structure.html +++ b/pytorch-sphinx-theme/docs/demo/structure.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                      FBGEMM_GPU General Info

                      +

                      FBGEMM_GPU General Info

                      -

                      FBGEMM_GPU Python API

                      +

                      FBGEMM_GPU Python API

                      -

                      FBGEMM_GPU C++ API

                      +

                      FBGEMM_GPU C++ API

                      - +

                      Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec lorem neque, interdum in ipsum nec, finibus dictum velit. Ut eu efficitur arcu, id aliquam erat. In sit amet diam gravida, imperdiet tellus eu, gravida nisl. Praesent aliquet odio eget libero elementum, quis rhoncus tellus tincidunt. @@ -393,7 +396,7 @@

                      1. St Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Pellentesque dictum dui sem, non placerat tortor rhoncus in. Sed placerat nulla at rhoncus iaculis.

                      -

                      1.1. Document Section

                      +

                      1.1. Document Section

                      Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed condimentum nulla vel neque venenatis, nec placerat lorem placerat. Cras purus eros, gravida vitae tincidunt id, vehicula nec nulla. Fusce aliquet auctor cursus. Phasellus ex neque, vestibulum non est vitae, viverra fringilla tortor. @@ -401,7 +404,7 @@

                      1.1. Aliquam erat volutpat. Maecenas eget dictum mauris. Suspendisse arcu eros, condimentum eget risus sed, luctus efficitur arcu. Cras ut dictum mi. Nulla congue interdum lorem, semper semper enim commodo nec.

                      -

                      1.1.1. Document Subsection

                      +

                      1.1.1. Document Subsection

                      Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam efficitur in eros et blandit. Nunc maximus, nisl at auctor vestibulum, justo ex sollicitudin ligula, id faucibus urna orci tristique nisl. Duis auctor rutrum orci, in ornare lacus condimentum quis. Quisque arcu velit, facilisis quis interdum ac, @@ -411,7 +414,7 @@

                      1.1.1.
                      -

                      1.1.1.1. Document Subsubsection

                      +

                      1.1.1.1. Document Subsubsection

                      Donec non rutrum lorem. Aenean sagittis metus at pharetra fringilla. Nunc sapien dolor, cursus sed nisi at, pretium tristique lectus. Sed pellentesque leo lectus, et convallis ipsum euismod a. Integer at leo vitae felis pretium aliquam fringilla quis odio. Sed pharetra enim accumsan feugiat pretium. @@ -419,7 +422,7 @@

                      1.1.1.1.
                      -
                      1.1.1.1.1. Document Paragraph
                      +
                      1.1.1.1.1. Document Paragraph

                      Pellentesque nec est in odio ultrices elementum. Vestibulum et hendrerit sapien, quis vulputate turpis. Suspendisse potenti. Curabitur tristique sit amet lectus non viverra. Phasellus rutrum dapibus turpis sed imperdiet. Mauris maximus viverra ante. Donec eu egestas mauris. Morbi vulputate tincidunt euismod. Integer vel porttitor neque. @@ -430,13 +433,13 @@

                      1.1.1.1.1. <

                      -

                      2. Structural Elements 2

                      +

                      2. Structural Elements 2

                      Etiam turpis ante, luctus sed velit tristique, finibus volutpat dui. Nam sagittis vel ante nec malesuada. Praesent dignissim mi nec ornare elementum. Nunc eu augue vel sem dignissim cursus sed et nulla. Pellentesque habitant morbi tristique senectus et netus et malesuada fames ac turpis egestas. Pellentesque dictum dui sem, non placerat tortor rhoncus in. Sed placerat nulla at rhoncus iaculis.

                      -

                      2.1. Document Section

                      +

                      2.1. Document Section

                      Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed condimentum nulla vel neque venenatis, nec placerat lorem placerat. Cras purus eros, gravida vitae tincidunt id, vehicula nec nulla. Fusce aliquet auctor cursus. Phasellus ex neque, vestibulum non est vitae, viverra fringilla tortor. @@ -444,7 +447,7 @@

                      2.1.
                      -

                      2.1.1. Document Subsection

                      +

                      2.1.1. Document Subsection

                      ../../../_images/yi_jing_01_chien.jpg
                      @@ -536,11 +539,9 @@

                      2.1.1. - - + - - + diff --git a/pytorch-sphinx-theme/docs/index.html b/pytorch-sphinx-theme/docs/index.html index 50826522b..5507cb381 100644 --- a/pytorch-sphinx-theme/docs/index.html +++ b/pytorch-sphinx-theme/docs/index.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                      FBGEMM_GPU General Info

                      +

                      FBGEMM_GPU General Info

                      -

                      FBGEMM_GPU Python API

                      +

                      FBGEMM_GPU Python API

                      -

                      FBGEMM_GPU C++ API

                      +

                      FBGEMM_GPU C++ API

                      • Sparse Data Operators
                      • Quantization Operators
                      • @@ -349,7 +352,7 @@
                        -

                        Theme Documentation

                        +

                        Theme Documentation

                        -

                        Demo Documents

                        +

                        Demo Documents

                        -

                        This is an incredibly long caption for a long menu

                        +

                        This is an incredibly long caption for a long menu

                        • 1. Long Sticky Nav
                          • 1.1. Example Menu 1
                          • @@ -493,11 +496,9 @@ - - + - - + diff --git a/pytorch-sphinx-theme/docs/installing.html b/pytorch-sphinx-theme/docs/installing.html index 2106d7395..ab4ef5e65 100644 --- a/pytorch-sphinx-theme/docs/installing.html +++ b/pytorch-sphinx-theme/docs/installing.html @@ -6,7 +6,7 @@ - + @@ -28,6 +28,8 @@ + + @@ -250,18 +252,19 @@ -

                            FBGEMM_GPU General Info

                            +

                            FBGEMM_GPU General Info

                            -

                            FBGEMM_GPU Python API

                            +

                            FBGEMM_GPU Python API

                            -

                            FBGEMM_GPU C++ API

                            +

                            FBGEMM_GPU C++ API

                            • Sparse Data Operators
                            • Quantization Operators
                            • @@ -349,9 +352,9 @@
                              -

                              Installation

                              +

                              Installation

                              -

                              Via Git or Download

                              +

                              Via Git or Download

                              Symlink or subtree the pytorch_sphinx_theme repository into your documentation at docs/_themes/pytorch_sphinx_theme then add the following two settings to your Sphinx conf.py file:

                              @@ -415,11 +418,9 @@

                              Via Git or Download - - + - - + diff --git a/quantize__bfloat16_8cu.html b/quantize__bfloat16_8cu.html new file mode 100644 index 000000000..2489c5b92 --- /dev/null +++ b/quantize__bfloat16_8cu.html @@ -0,0 +1,172 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_bfloat16.cu File Reference + + + + + + + + + + + + +
                              + +
                              quantize_bfloat16.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              + + + + + +

                              +Functions

                              DLL_PUBLIC at::Tensor _float_to_bfloat16_gpu (const at::Tensor &input)
                               
                              DLL_PUBLIC at::Tensor _bfloat16_to_float_gpu (const at::Tensor &input)
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "Bfloat16QuantizedToFloat" ,
                              fbgemm_gpu::_bfloat16_to_float_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToBfloat16Quantized" ,
                              fbgemm_gpu::_float_to_bfloat16_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__fp8__rowwise_8cu.html b/quantize__fp8__rowwise_8cu.html new file mode 100644 index 000000000..40bd56491 --- /dev/null +++ b/quantize__fp8__rowwise_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_fp8_rowwise.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_fp8_rowwise.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__fused__8bit__rowwise_8cu.html b/quantize__fused__8bit__rowwise_8cu.html new file mode 100644 index 000000000..16a21dab2 --- /dev/null +++ b/quantize__fused__8bit__rowwise_8cu.html @@ -0,0 +1,294 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_fused_8bit_rowwise.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_fused_8bit_rowwise.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatOrHalfToFused8BitRowwiseQuantized" ,
                              fbgemm_gpu::_single_or_half_precision_to_fused8bitrowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToFused8BitRowwiseQuantized" ,
                              fbgemm_gpu::_float_to_fused8bitrowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [3/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "Fused8BitRowwiseQuantizedToFloat" ,
                              fbgemm_gpu::_fused8bitrowwise_to_float_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [4/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "Fused8BitRowwiseQuantizedToFloatMixedDim" ,
                              fbgemm_gpu::_fused8bitrowwise_to_float_mixed_dim_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [5/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "Fused8BitRowwiseQuantizedToFloatOrHalf" ,
                              fbgemm_gpu::_fused8bitrowwise_to_single_or_half_precision_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [6/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "Fused8BitRowwiseQuantizedToHalf" ,
                              fbgemm_gpu::_fused8bitrowwise_to_half_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [7/7]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "HalfToFused8BitRowwiseQuantized" ,
                              fbgemm_gpu::_half_to_fused8bitrowwise_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__fused__nbit__rowwise_8cu.html b/quantize__fused__nbit__rowwise_8cu.html new file mode 100644 index 000000000..a1b0968f1 --- /dev/null +++ b/quantize__fused__nbit__rowwise_8cu.html @@ -0,0 +1,268 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_fused_nbit_rowwise.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_fused_nbit_rowwise.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf" ,
                              fbgemm_gpu::_float_or_half_to_fusednbitrowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToFusedNBitRowwiseQuantizedSBHalf" ,
                              fbgemm_gpu::_float_to_fusednbitrowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [3/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FusedNBitRowwiseQuantizedSBHalfToFloat" ,
                              fbgemm_gpu::_fusednbitrowwise_to_float_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [4/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf" ,
                              fbgemm_gpu::_fusednbitrowwise_to_float_or_half_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [5/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FusedNBitRowwiseQuantizedSBHalfToHalf" ,
                              fbgemm_gpu::_fusednbitrowwise_to_half_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [6/6]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "HalfToFusedNBitRowwiseQuantizedSBHalf" ,
                              fbgemm_gpu::_half_to_fusednbitrowwise_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__hfp8_8cu.html b/quantize__hfp8_8cu.html new file mode 100644 index 000000000..b8b31b2eb --- /dev/null +++ b/quantize__hfp8_8cu.html @@ -0,0 +1,172 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_hfp8.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_hfp8.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              + + + + + +

                              +Functions

                              DLL_PUBLIC at::Tensor _float_to_hfp8_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias, const double max_pos)
                               
                              DLL_PUBLIC at::Tensor _hfp8_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t exponent_bias)
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToHFP8Quantized" ,
                              fbgemm_gpu::_float_to_hfp8_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "HFP8QuantizedToFloat" ,
                              fbgemm_gpu::_hfp8_to_float_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__msfp_8cu.html b/quantize__msfp_8cu.html new file mode 100644 index 000000000..a2ef4eab7 --- /dev/null +++ b/quantize__msfp_8cu.html @@ -0,0 +1,172 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_msfp.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_msfp.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              + + + + + +

                              +Functions

                              DLL_PUBLIC at::Tensor _float_to_msfp_gpu (const at::Tensor &input, const int64_t bounding_box_size, const int64_t ebits, const int64_t mbits, const int64_t bias, const double min_pos, const double max_pos)
                               
                              DLL_PUBLIC at::Tensor _msfp_to_float_gpu (const at::Tensor &input, const int64_t ebits, const int64_t mbits, const int64_t bias)
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToMSFPQuantized" ,
                              fbgemm_gpu::_float_to_msfp_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/2]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "MSFPQuantizedToFloat" ,
                              fbgemm_gpu::_msfp_to_float_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__ops_2common_8cuh.html b/quantize__ops_2common_8cuh.html new file mode 100644 index 000000000..5ddca65c2 --- /dev/null +++ b/quantize__ops_2common_8cuh.html @@ -0,0 +1,162 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/common.cuh File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              +
                              common.cuh File Reference
                              +
                              +
                              +
                              #include <ATen/TensorIterator.h>
                              +#include <ATen/cuda/Exceptions.h>
                              +#include <c10/cuda/CUDAGuard.h>
                              +#include <math_constants.h>
                              +#include <ATen/ATen.h>
                              +#include <ATen/TensorUtils.h>
                              +#include <ATen/core/TensorAccessor.h>
                              +#include <ATen/native/TensorIterator.h>
                              +#include <ATen/native/cuda/Loops.cuh>
                              +#include "fbgemm_gpu/dispatch_macros.h"
                              +#include "fbgemm_gpu/embedding_common.h"
                              +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                              +#include "fbgemm_gpu/ops_utils.h"
                              +#include "fbgemm_gpu/quantize_ops.cuh"
                              +#include "fbgemm_gpu/quantize_ops_utils.h"
                              +#include "fbgemm_gpu/sparse_ops.h"
                              +#include "fbgemm_gpu/sparse_ops_utils.h"
                              +

                              Macro Definition Documentation

                              + +

                              ◆ QUANTIZE_OPS_MAX

                              + +
                              +
                              + + + + + + + + + + + +
                              #define QUANTIZE_OPS_MAX( a,
                              b )   ((a) > (b) ? (a) : (b))
                              +
                              + +
                              +
                              + +

                              ◆ QUANTIZE_OPS_MIN

                              + +
                              +
                              + + + + + + + + + + + +
                              #define QUANTIZE_OPS_MIN( a,
                              b )   ((a) < (b) ? (a) : (b))
                              +
                              + +
                              +
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__ops_8cuh.html b/quantize__ops_8cuh.html new file mode 100644 index 000000000..986956dcc --- /dev/null +++ b/quantize__ops_8cuh.html @@ -0,0 +1,95 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/quantize_ops.cuh File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_ops.cuh File Reference
                              +
                              +
                              + + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +
                              + + + + diff --git a/quantize__ops__cpu_8cpp.html b/quantize__ops__cpu_8cpp.html new file mode 100644 index 000000000..a54d5a158 --- /dev/null +++ b/quantize__ops__cpu_8cpp.html @@ -0,0 +1,167 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_ops_cpu.cpp File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_ops_cpu.cpp File Reference
                              +
                              +
                              +
                              #include <ATen/ATen.h>
                              +#include <ATen/core/op_registration/op_registration.h>
                              +#include <fbgemm_gpu/sparse_ops.h>
                              +#include <fbgemm_gpu/sparse_ops_utils.h>
                              +#include <torch/library.h>
                              +#include "fbgemm/QuantUtils.h"
                              +#include "fbgemm_gpu/dispatch_macros.h"
                              +#include "fbgemm_gpu/embedding_common.h"
                              +#include "fbgemm_gpu/quantize_ops_utils.h"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ TORCH_LIBRARY_FRAGMENT()

                              + +
                              +
                              + + + + + + + + + + + +
                              TORCH_LIBRARY_FRAGMENT (fbgemm ,
                              m  )
                              +
                              + +
                              +
                              + +

                              ◆ TORCH_LIBRARY_IMPL()

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              TORCH_LIBRARY_IMPL (fbgemm ,
                              CPU ,
                              m  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__ops__gpu_8cpp.html b/quantize__ops__gpu_8cpp.html new file mode 100644 index 000000000..479c4d991 --- /dev/null +++ b/quantize__ops__gpu_8cpp.html @@ -0,0 +1,195 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_ops_gpu.cpp File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              +
                              quantize_ops_gpu.cpp File Reference
                              +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ FBGEMM_OP_DISPATCH() [1/4]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToFP8RowwiseQuantized" ,
                              fbgemm_gpu::_float_to_FP8rowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [2/4]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FloatToPaddedFP8RowwiseQuantized" ,
                              fbgemm_gpu::_float_to_paddedFP8rowwise_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [3/4]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "FP8RowwiseQuantizedToFloat" ,
                              fbgemm_gpu::_FP8rowwise_to_float_gpu  )
                              +
                              + +
                              +
                              + +

                              ◆ FBGEMM_OP_DISPATCH() [4/4]

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              FBGEMM_OP_DISPATCH (CUDA ,
                              "PaddedFP8RowwiseQuantizedToFloat" ,
                              fbgemm_gpu::_paddedFP8rowwise_to_float_gpu  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__ops__meta_8cpp.html b/quantize__ops__meta_8cpp.html new file mode 100644 index 000000000..3594e355d --- /dev/null +++ b/quantize__ops__meta_8cpp.html @@ -0,0 +1,144 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_ops_meta.cpp File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_ops_meta.cpp File Reference
                              +
                              +
                              +
                              #include <ATen/ATen.h>
                              +#include <ATen/core/op_registration/op_registration.h>
                              +#include <torch/library.h>
                              +#include "c10/core/ScalarType.h"
                              +#include "fbgemm_gpu/embedding_common.h"
                              +#include "fbgemm_gpu/sparse_ops.h"
                              +#include "fbgemm_gpu/sparse_ops_utils.h"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ TORCH_LIBRARY_IMPL()

                              + +
                              +
                              + + + + + + + + + + + + + + + + +
                              TORCH_LIBRARY_IMPL (fbgemm ,
                              Meta ,
                              m  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/quantize__ops__utils_8h.html b/quantize__ops__utils_8h.html new file mode 100644 index 000000000..f013c0ff3 --- /dev/null +++ b/quantize__ops__utils_8h.html @@ -0,0 +1,96 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/quantize_ops_utils.h File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_ops_utils.h File Reference
                              +
                              +
                              +
                              #include <ATen/ATen.h>
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +
                              + + + + diff --git a/quantize__padded__fp8__rowwise_8cu.html b/quantize__padded__fp8__rowwise_8cu.html new file mode 100644 index 000000000..d9a3effb9 --- /dev/null +++ b/quantize__padded__fp8__rowwise_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/quantize_ops/quantize_padded_fp8_rowwise.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              + +
                              quantize_padded_fp8_rowwise.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +
                              + + + +

                              +Namespaces

                              namespace  fbgemm_gpu
                               
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/radix__sort__pairs_8cu.html b/radix__sort__pairs_8cu.html new file mode 100644 index 000000000..ad954812a --- /dev/null +++ b/radix__sort__pairs_8cu.html @@ -0,0 +1,246 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_utils/radix_sort_pairs.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              +
                              radix_sort_pairs.cu File Reference
                              +
                              +
                              +
                              #include "fbgemm_gpu/split_embeddings_utils.cuh"
                              +#include <c10/cuda/CUDAException.h>
                              +#include <c10/cuda/CUDAStream.h>
                              +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                              +#include "fbgemm_gpu/ops_utils.h"
                              +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                              +#include <cub/device/device_radix_sort.cuh>
                              +#include <cub/device/device_run_length_encode.cuh>
                              +#include <cub/device/device_scan.cuh>
                              +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                              +

                              Macro Definition Documentation

                              + +

                              ◆ DEF_RADIX_SORT_PAIRS_FN

                              + +
                              +
                              + + + + + + + + + + + +
                              #define DEF_RADIX_SORT_PAIRS_FN( KeyT,
                              ValueT )
                              +
                              +Value:
                              +
                              void* d_temp_storage, \
                              +
                              size_t& temp_storage_bytes, \
                              +
                              const KeyT* d_keys_in, \
                              + +
                              const ValueT* d_values_in, \
                              + +
                              const int num_items, \
                              +
                              const int begin_bit, \
                              +
                              const int end_bit, \
                              + +
                              return FBGEMM_GPU_CUB_NS_PREFIX cub::DeviceRadixSort::SortPairs( \
                              + + + + + + + + + +
                              stream, \
                              +
                              false); \
                              +
                              }
                              +
                              Definition fbgemm_tensor_accessor.h:128
                              +
                              #define FBGEMM_GPU_CUB_NS_PREFIX
                              Definition cub_namespace_postfix.cuh:34
                              +
                              #define DLL_PUBLIC
                              Definition ops_utils.h:39
                              +
                              +
                              +
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +

                              Function Documentation

                              + +

                              ◆ DEF_RADIX_SORT_PAIRS_FN() [1/4]

                              + +
                              +
                              + + + + + + + + + + + +
                              DEF_RADIX_SORT_PAIRS_FN (int64_t ,
                              double  )
                              +
                              + +
                              +
                              + +

                              ◆ DEF_RADIX_SORT_PAIRS_FN() [2/4]

                              + +
                              +
                              + + + + + + + + + + + +
                              DEF_RADIX_SORT_PAIRS_FN (int64_t ,
                              float  )
                              +
                              + +
                              +
                              + +

                              ◆ DEF_RADIX_SORT_PAIRS_FN() [3/4]

                              + +
                              +
                              + + + + + + + + + + + +
                              DEF_RADIX_SORT_PAIRS_FN (int64_t ,
                              int32_t  )
                              +
                              + +
                              +
                              + +

                              ◆ DEF_RADIX_SORT_PAIRS_FN() [4/4]

                              + +
                              +
                              + + + + + + + + + + + +
                              DEF_RADIX_SORT_PAIRS_FN (int64_t ,
                              int64_t  )
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/reset__weight__momentum_8cu.html b/reset__weight__momentum_8cu.html new file mode 100644 index 000000000..a302d86da --- /dev/null +++ b/reset__weight__momentum_8cu.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/reset_weight_momentum.cu File Reference + + + + + + + + + + + +
                              +
                              + + + + + + +
                              +
                              fbgemm_gpu +
                              +
                              +
                              + + + + + + + + +
                              +
                              + + +
                              +
                              +
                              +
                              +
                              +
                              Loading...
                              +
                              Searching...
                              +
                              No Matches
                              +
                              +
                              +
                              +
                              + + +
                              +
                              +
                              reset_weight_momentum.cu File Reference
                              +
                              +
                              +
                              #include "common.cuh"
                              +

                              Typedef Documentation

                              + +

                              ◆ Tensor

                              + +
                              +
                              + + + + +
                              using Tensor = at::Tensor
                              +
                              + +
                              +
                              +
                              + + + + diff --git a/search.html b/search.html index 297368a34..8d1c8d7a4 100644 --- a/search.html +++ b/search.html @@ -27,6 +27,8 @@ + + @@ -249,18 +251,19 @@ -

                              FBGEMM_GPU General Info

                              +

                              FBGEMM_GPU General Info

                              -

                              FBGEMM_GPU Python API

                              +

                              FBGEMM_GPU Python API

                              -

                              FBGEMM_GPU C++ API

                              +

                              FBGEMM_GPU C++ API

                              • Sparse Data Operators
                              • Quantization Operators
                              • @@ -405,11 +408,9 @@ - - + - - + diff --git a/search/all_0.js b/search/all_0.js index 41eb52c4a..a844e2673 100644 --- a/search/all_0.js +++ b/search/all_0.js @@ -1,9 +1,70 @@ var searchData= [ - ['_5fbfloat16_5fto_5ffloat_5fgpu_0',['_bfloat16_to_float_gpu',['../group__quantize-ops-cuda.html#ga2076a59fd190690f67c1eddb79b6acc4',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fbfloat16_5fgpu_1',['_float_to_bfloat16_gpu',['../group__quantize-ops-cuda.html#ga2f1cc4b6dc6f708324855f94d558cfc1',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fhfp8_5fgpu_2',['_float_to_hfp8_gpu',['../group__quantize-ops-cuda.html#gab2837424e3774fe34ba255658554a75a',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fmsfp_5fgpu_3',['_float_to_msfp_gpu',['../group__quantize-ops-cuda.html#ga427f81e1d8901e2fafc9611860fbd4d5',1,'fbgemm_gpu']]], - ['_5fhfp8_5fto_5ffloat_5fgpu_4',['_hfp8_to_float_gpu',['../group__quantize-ops-cuda.html#ga03a8f8825a16c6235b699886fa46e1f6',1,'fbgemm_gpu']]], - ['_5fmsfp_5fto_5ffloat_5fgpu_5',['_msfp_to_float_gpu',['../group__quantize-ops-cuda.html#gac0c20377454dbfafcc5ac245fe6427ce',1,'fbgemm_gpu']]] + ['_5f_5falign_5f_5f_0',['__align__',['../namespacefbgemm__gpu.html#a9a25aa8cfdd2801c4576fb7111ca1e34',1,'fbgemm_gpu::__align__(32) float8'],['../namespacefbgemm__gpu.html#ac5ef7f218ca22e4dd93d4161458006f6',1,'fbgemm_gpu::__align__(64) float_16'],['../namespacefbgemm__gpu.html#a5365b81a771afde2d770210e45b73bdb',1,'fbgemm_gpu::__align__(8) half4'],['../namespacefbgemm__gpu.html#ad5af23eb5e28d14f6089e7a18b0ed0d5',1,'fbgemm_gpu::__align__(16) half8']]], + ['_5f_5fhalf2_5fto_5fui_1',['__HALF2_TO_UI',['../fbgemm__cuda__utils_8cuh.html#ab78d230e0bbda883a8f34ca1e31d0929',1,'fbgemm_cuda_utils.cuh']]], + ['_5f_5fhas_5finclude_2',['__has_include',['../_c_make_c_compiler_id_8c.html#ae5510d82e4946f1656f4969911c54736',1,'__has_include: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#ae5510d82e4946f1656f4969911c54736',1,'__has_include: CMakeCXXCompilerId.cpp']]], + 
['_5f_5flaunch_5fbounds_5f_5f_3',['__launch_bounds__',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6e4504b4f1023565bf18ac29f304f165',1,'__launch_bounds__(kMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_cta_per_row(const pta: gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a419781019c14d9d59041ca2a127d2c1a',1,'__launch_bounds__(kMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_cta_per_row< uint8_t: gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#af1eb0a147a3656c72bff10b68454c23b',1,'__launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row(const pta: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a422cac14ead186e7d1ffdea24dbb41a2',1,'__launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row< uint8_t: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#afe75d514238f01862b4416d072a457ab',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_kernel(const pta: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a794e5a8311030e080f19bcaf98cbaa3e',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_kernel< uint8_t: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#aab67c7ff63262ed7ee2955ab54fd6cdb',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_small_kernel(const pta: 
gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a09ab46cf824219bc6c7ca9a47e3d90cd',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_small_kernel< uint8_t: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#aec97e553558684266790dc906158a105',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a60482659dcb929a1f6a60dda564f4cdc',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a50cb7dfbe0185fcbd26cfd0156710acc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a0ed9968b042349d756a20bfc8c31c22d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a7f198a235aa56925b36d48d029f9a26a',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abb3af3ab6c99e8609b2199129b2a6c3d',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2ca5c0c3b7f03146b0739206987a8efb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad265ff9fd07f592055eb413d73ff59a3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a479b62e3a680d0eb604b0d99c497dc44',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a16936797cd22aeea32b40dcc55e1d73f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a665c5d75524a34cec6f5b5258b182d7a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a7729be76298454212379af9803e78cf9',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aeb6425d7cade524ae83445d8ffcad95a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#ada6a5fbef27c4a4a31a9b8794e15442e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a8a0814be275ca40dd482231bf8be61ef',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#abcaa8e0b99a97add31e16f0454bd57d3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a30fd75bf7de9f2dd4c1af90a76cc4cab',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a43a11629fc716aa3fc2efce282ade1bf',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a1e6a8699bf2c46477da50582e38ee237',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae3a9242f5ffd888400f08b8c1662cc61',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#aa7724fd36f338edda8cec8fbce0dcc3f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#aaa0f0d28eaca058bde829af48b4a9b93',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a141a421e122929281f3a968d7181075d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#aa53241ccd067fda3b4f745364d104ae7',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#aef74039cc67d8a29f2964dd2ead5c884',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a8005f4419a0e99b1adc8ba836e2bacc4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#af4e9ad9da78c796024828e400596398e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abbbfbac2a0d5a12edfd4fa6e476f5089',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a727c25d68451d781ee3328a76b544770',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a942a80794035682b67bf75531af7ea76',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a02950b6e35152a847c545ef90af6c315',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a46c9fa7a8cf628e30c5bcbd6713846b2',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#adf6d412fe63bcfdcd84fc4e45f616217',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a360c18a2f091431cf7f15e6ac14e848a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a60ab111bc496bd3b843b3d73350f6695',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#abe773e17b7f19a70a10efe7bf1763c07',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#ad49c5c5e6c69ba836c2c3728d383cd5c',1,'__launch_bounds__(kForwardMaxThreads) void dense_embedding_codegen_grad_indice_weights_kernel(const pta: 
gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a71d10fab767a3f6a4c9845432b7c673b',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a134107427281e66b9bdc1f05e0ed2006',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a321e6c7a5bc2c920f083dadb4d023bae',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a705c39686bcf17986ce0182b31944a82',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a8f6c6ea91c21be19960e453b8f83698b',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#aa068d67521003fac6c5013d12698b228',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a4d92990636a3fcdbe762a413cc96c642',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a9629d38b5ab429da94bb1d5099042123',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a044189dd94a5b69db982c5e78a8258f4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a44794beb7b535ee85a06027407e9578d',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#aec8fd1dccb91dec69eee635d8cc8cae3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ae68abaaf02536c2e20decd2ca4daef60',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aa9d0b42dc9a6b6c25005e5adc6a412e3',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#af2bc792b1cf28a27ebfc0866b059fa81',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a502bf7dfa5a02ec71b77763a65ec91c5',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a8026675b09ae447bd48ab0a854ea28bf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aac66a737c59bab7e9f767b1e38d5f1d3',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#afd63238f6b7c4a1e468568bda42bb3e0',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a4e2287d8d0e80b53a592337a64570d66',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a11d09ebd4c4b65fb35d265de845d73fc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#afa2ba02eba70da5c0a8fdcd8509e7e77',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a61e7f43722eeda4e4234e1af525ae46e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afcf8fbf4f5013c1082ce86fa5c3a5fd4',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#ab184e11501d6d031e538c60ef66a8342',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a8ea3bce56ea941e3716f81220ab88fe5',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#aaa9702e86f8ed1788c7796017bdd404c',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a0c5ac630cac3e582871b2521984d3691',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af7be11e596974198a45beaacc4d9db0f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a1be3f22e4eb6db21e09d922580c54faf',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a84b86dee7ee70d3e3ba5ae6f466c6f0e',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a89a6fc31913b2347216065f4655b82ff',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a584ad4898a3e03f279eb3a39c419735e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#aa2038bd1822625bd55a38eed4240c39a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#ae41fadf6abfe1e00dccedd18b90dab32',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a9ba65eca59bd0b29e87b4adb5a444d1b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a60776cad67cb695e9768c1ce170aed12',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#adb6e98291bfdb46d09389b2b453e54b2',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aab6d47d46fccfb5d973f8ff2a44bff7a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a9d6d3f6a070db2a520adb97ff89e7f1c',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#adda6d498fce399be1bb4ff6c884cd325',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a3c38980139cb0d10bc2d195479a69fb4',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a55340037f2150aa438d4cb6675412e7e',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ae53e64a9190921226cba0e54595de4af',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a6c8f5295879f30dac04285180744b05b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a49683c14f18b75fafd2a5ce3f90c7d61',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a78a9d364ed7043a1412228b17a0406a1',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#acd2e3179752c56bfdde47a8ad7a00220',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a4d483ee9ae74898f27f8070e41c4fced',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a440dc2054a1346ad291f617540be2e25',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#afaf745a30243c0c755429f1b1d465f2d',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#ab435787b7738dff4daa1eca5ed8725dd',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a809fb9fd9f5386090e58c2fdd7f05bbd',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a250b8485cb708a3fe1d789613014b238',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a4996180a982a92cb9151e2557777d77a',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae96e8ad601ac1adb859d3aec074bb439',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae28f985f3c5d59410f3fd6c2a99d9320',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a59f00431d3950b72f6e7d89baf3fde0b',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a2f233fc13ea7dbc092ed3c22b2bf1a7f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a965b9c456ca6a6dffb664f585401250d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a69d372f391200ef3cafedad093a5470f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#ad23ff52f91efba0cbff48134c3a42bc4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aa9475900cea03cb0a61e0e16932e01a4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#aeea6e4ebbd44a284f8e1078cf3efdaad',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a75a810317afae4c2a93af95f80855d42',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a41774547fd61442443c1967f1a8e8b13',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa41bfc39f4114bbad7186e4b9b480da3',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a84d2573599cb14db8200acded518dd53',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ad3410f599c95c3268541e72f9684f82b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#aee08a6146cbf90f361a828e6d2ff4ede',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a12bca8c5fdd115d24668beab2bb8ea27',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#ae203f025f99b18448dfd355a519c4121',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a87a9718ff816d6e1bdd9dca8e067e341',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a84dcaeb939254f551d6c356d1eca8747',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#af8829bdb0d543a40bb769900d36ea13e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ae9187ee78b193e34f92875da955dc6de',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ae7b604d06f2afe4b8d99b94b6a7ca46f',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a65ebc941a004af813be547c2114c6eca',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#aaba75f921548599cff242a4033a381c9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad0d4a168e8e591add8c872d4c2fff64a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#afbc119c8f230ecbf041ca9d852021a4a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a634a690ed27c50d8308bcc0a9bf85acc',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#ad38fb7b8c66635da0517434c661ef2e2',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#ac9d49c8094b87daf6025d9195437119e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a86d693b20d7be5e068994e693d970104',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ad7474c2dcf75a987f9526e730542ae16',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a76005fdee1a342df4b951b9191967576',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a73ddb7ffe3131b43c027bed87a21da0c',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#ad386be3805dc66bcebfcc75ae6ce20ce',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a557205856561135a510a45e915bc0714',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a866fa5e6f036f9befaef0a014527b214',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a15977bf39e5dbde54bc2d1176a9272b9',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a4c67d23288adf2fc636e9db4c30bfa5e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a18e29f7653534f3a75e41cf3056d2634',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#ad0f3c1412b7b4ddb2f3c5262b27f5b46',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a56d820ab8e2e5c1e815ecbe5e906075e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#aaecc4ec4c793272693a37f0e027dfb93',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a254195fbabfff3c3ad9ba04db100afae',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a2fdb05c57c2efe83f57ce0ccfe97f861',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a28a51c35ffb6aac4d6b35c9b87960129',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a303830fd0513ecd4eb232556376ad2ff',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#abacbb190c3b418788aa37c065b93e703',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a6f94595430b5a0e8c1597b72f210095f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#ae73b050da138bd46bcb186f630a45f1e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a0f05baa1d7dca3d78338fcd70e11487c',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a1296e33305fd2cde7e9e34e18e7e7905',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a49dd26094cead9644cbc35c29bb5bb21',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a709a4f70083ce173ce40562aa52ad3c8',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abf79428f3dcf0b60bcff9074d587aeaf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a99c23e8020a9ae93a0d0d429c6940707',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a920aba769ec4eba77d74c4cce2f0aa5a',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aacedf2a727684a316ae18abf5670f8e8',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a9354545fca8047a3359cc39269e4531f',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a158fb407fba50cda959d3a60cbc01d91',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a0344106c25fea0c6358540ff4bd536f8',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ade29dc18e73de993e107177d9568fbdf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a913d8fc72158bf301f064c0e60657a18',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a79b96d6a0be54ea86ebd1cadeedd2068',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a66e6adb0beac238f39d443dffa3c0161',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#ad4cbc31bac8a8d965f3549045cd85999',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a30ae1e9efc40a515dca89e5e3ef46565',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a4987b540b661f1caa132231f415c45a9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abeb949f70e925c2f8011d973d75645fc',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a08d8db556761e8e68193b2cc8a32a1cc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ac9a5abe82611fbf748e346094a7b24b2',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ae73620aca9ffc6e0cfd3b9cb594bdaf0',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a8275b2b19c2713679e0404cfc50cfc4f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a2af51d716ed8d2b1a926e0f237b76f71',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#ad63ba5d695275d09b7f72a2e3fc6c124',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#afd015e1d0e79f14de8ed5bdf578c81df',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#abc14cf31cc4a8f906bc7f25d594fafc1',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a63e7a313c891f643c307bd05041a5b54',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#aaddcf08714b3cc33953d207c24e0be7f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a320b7cb4717a06125d1e05149e7414a9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a6736b927e85af06f2a8f64b95a527f35',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aec3f0f560b496881e95413f483dc0c32',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a1bac18bde859aad7fbfb3871a0bacf37',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#aaa0317297f080a5b537f22049d8ecbbe',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a9c866240eb5eb8df0da4e1ee803e04cf',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a040a74b95b542902bfb38bacd03202eb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#aa77ffcc8cedf9fe2668e96e9305bdccb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a02bd16452698dd0ae512e183e1ed25bb',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#aab426569c3d6a90703854ec88079c3cf',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ab2b8f92ece6c5a09d11a65969626378d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ac60290f3d38a825226fe8014a9274e3d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a4ca2ae3bf6df90dd1f3a4bf8b534231e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#ac83482e2c195bd6662609604217a4903',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a412bd503e722e4451e55ef89a4bb3649',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a5cbbacf7ec8ecfad9f032e7217474f71',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__split__grad_8cu.html#a2dd7fc517b5148ca80cff10cd7cbcaed',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_find_long_segments(const pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#aea453d06a5b06a7263bbb3c3c598b805',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_count_unique_indices_kernel(const pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#acfcb5a71381871c2d136a1e7ffc68b4c',1,'__launch_bounds__(kMaxThreads) void grad_mean_vbe_kernel(pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#a9cbee37a9474b3f03b3e585c448b63ee',1,'__launch_bounds__(kMaxThreads) void grad_mean_kernel(pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#a422182213e14442c911aa3ba3ed18a58',1,'__launch_bounds__(kForwardMaxThreads) void split_embedding_codegen_grad_indice_weights_vbe_kernel(const pta: 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ab27358be96fd39a3d879e0e3f942c616',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a830a55ef37b6607a42e4b4cbb6889aa5',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a0178272d43da8f09567a976c98e4617c',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a13a4edf8545bd07a774fe7420e8d397b',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#aa128173842fe96c64a581b2efdd5fe7e',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a38384adec04c76c7f4267c8c1cdc7ff7',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_small_kernel< uint8_t: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a40c420d5aadf8202b8a9de25931c44ff',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_weighted_kernel(const 
pta: gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aa272d7ae5549cc1f16cb4761f3edf890',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_weighted_kernel< uint8_t: gen_embedding_forward_dense_weighted_kernel.cu'],['../namespacenbit.html#adf462393afe5c0c395c48cf4f889c6f8',1,'nbit::__launch_bounds__(WarpsPerBlock *kWarpSize) __global__ void FP16_split_embedding_codegen_forward_unweighted_kernel_small_L(const pta'],['../namespacenbit.html#aced6599a5180c2faaff5bbb9bc92f147',1,'nbit::__launch_bounds__(4 *kWarpSize) __global__ void FP16_split_embedding_codegen_forward_unweighted_kernel_small_L< int32_t'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#ab08dd38a042ee1b012a6db152e28df6d',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a41deb3b48278a02504f49a2a3dc15cd8',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5ea0ab17f6d9eefd8f00e171c4d8b424',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a9b1f7936d16c021a06b52e10047d17c9',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_kernel(const pta: 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ae658cdd019bf968ffa65e519118af108',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#af345685cdddd68d8304b0804863bc611',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a4c26c8149d8b4a96823082303a657531',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel< uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a98033ae44aee4b9db7201fdad50c28db',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a640269bb96d2014f8c117163f09d8228',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< uint8_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a718566769c1ceda303b72d8876532ea6',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: 
gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a02d4931cef892bdaf44d3ab510f0d655',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_vbe_kernel(const pta: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a95e359c3e33b1c2fcc6bb83a101c998f',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_vbe_kernel< uint8_t: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a0f7cdacc2963885ca7eddcf74c44c1e7',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_kernel(const pta: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a183af91deddd1a5f4c5d1657476d2594',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_kernel< uint8_t: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7cf7d29de243a1d3d643b7f99420ca73',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ 
void split_embedding_codegen_forward_weighted_v2_kernel< uint8_t: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a2b31286ebfaa57f2a8e43418dc0cc2bc',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_vbe_kernel(const pta: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a4e5e2097a867f5ac61d945360d16e1ed',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_vbe_kernel< uint8_t: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#ab54a42bb86f9a913d382b4938e3b023f',1,'__launch_bounds__(kMaxThreads) void split_rowwise_adagrad_update_kernel(at: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a34f0bcf2172442db1cd089b529e81d11',1,'__launch_bounds__(kMaxThreads) void split_rowwise_adagrad_update_kernel< uint8_t: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu'],['../embedding__backward__split__grad__template_8cu.html#a2dd7fc517b5148ca80cff10cd7cbcaed',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_find_long_segments(const pta: embedding_backward_split_grad_template.cu'],['../embedding__backward__split__grad__template_8cu.html#aea453d06a5b06a7263bbb3c3c598b805',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_count_unique_indices_kernel(const pta: 
embedding_backward_split_grad_template.cu'],['../embedding__backward__split__kernel__cta__template_8cu.html#a436fa7b0b61202c628c4ca50bc9b1bcd',1,'__launch_bounds__(kMaxThreads) void: embedding_backward_split_kernel_cta_template.cu'],['../embedding__backward__split__kernel__warp__template_8cu.html#aa63bd2cb4cfc6b18191236e0a85bdd26',1,'__launch_bounds__(kBackwardMaxThreads) void: embedding_backward_split_kernel_warp_template.cu'],['../embedding__backward__split__template_8cu.html#a436fa7b0b61202c628c4ca50bc9b1bcd',1,'__launch_bounds__(kMaxThreads) void: embedding_backward_split_template.cu'],['../embedding__bounds__check_8cu.html#a9fcdcf37685cd2ec9b88dfac7e77aaaa',1,'__launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(const at: embedding_bounds_check.cu'],['../namespacenbit.html#a0a75b5eade7f9536629ce45b5827fb31',1,'nbit::__launch_bounds__()'],['../embedding__forward__split__kernel__nobag__small__template_8cu.html#a5c289e92014011ec16430dabf2272ae8',1,'__launch_bounds__(kForwardMaxThreads) __global__ void: embedding_forward_split_kernel_nobag_small_template.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a9bbd05d6885ea75e7564678a52104538',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_: embedding_forward_split_kernel_v2_template.cu'],['../embedding__optimizer__split__kernel__template_8cu.html#a69cc59925f75e23b97fe9e48e72bb900',1,'__launch_bounds__(kMaxThreads) void split_: embedding_optimizer_split_kernel_template.cu'],['../embedding__optimizer__split__template_8cu.html#a69cc59925f75e23b97fe9e48e72bb900',1,'__launch_bounds__(kMaxThreads) void split_: embedding_optimizer_split_template.cu'],['../bench__utils_8cuh.html#a59e0073dcf6e90b2d7a7b38f6210cb50',1,'__launch_bounds__(kMaxThreads) void flush_gpu(char *d_flush: bench_utils.cuh'],['../namespacefbgemm__gpu.html#a17d5a2e40c83e6e3f5c68e375bf468f7',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) __global__ void 
embedding_inplace_update_kernel(at'],['../namespacefbgemm__gpu.html#ac93e7c311a1d26fbe8815c8b34a6bde4',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel(const at'],['../namespacefbgemm__gpu.html#a50af77e9607a7a96addff8aa8e5e4508',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void to_dense_segment_value_kernel(const int64_t num_lengths'],['../namespacefbgemm__gpu.html#a28846f89e09ae2fc064e73142d83ceef',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(const pta'],['../namespacefbgemm__gpu.html#ad21c70bdd84772ee2b9b3950c87e9791',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta'],['../namespacefbgemm__gpu.html#afd2e24ffed8f057a2092d699b4cb3cb0',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_index_add_2d_kernel(at'],['../namespacefbgemm__gpu.html#ac59415a66e49753fb42195f0d816c7c2',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel2(int lengths_size'],['../transpose__embedding__input_8cu.html#a91943a24b789081d81916b94ee7789ad',1,'__launch_bounds__(kMaxThreads) void linearize_index_kernel(const at: transpose_embedding_input.cu'],['../transpose__embedding__input_8cu.html#aee01a74e30c13b20ffba0c0737c44425',1,'__launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel(const at: transpose_embedding_input.cu'],['../ssd__split__embeddings__cache__cuda_8cu.html#a7d15f4b6131224480844be177fe6b28d',1,'__launch_bounds__(kMaxThreads) void masked_index_put_kernel(at: ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__embeddings__cache__cuda_8cu.html#aac79184e1b6e3d831580eba191b6da2e',1,'__launch_bounds__(kMaxThreads) void masked_index_put_kernel(at: ssd_split_embeddings_cache_cuda.cu']]], + ['_5fbfloat16_5fto_5ffloat_5fcpu_4',['_bfloat16_to_float_cpu',['../namespacefbgemm__gpu.html#ad8c67a657c3008d1d87472f216f7908f',1,'fbgemm_gpu']]], + 
['_5fbfloat16_5fto_5ffloat_5fgpu_5',['_bfloat16_to_float_gpu',['../group__quantize-ops-cuda.html#ga2076a59fd190690f67c1eddb79b6acc4',1,'fbgemm_gpu']]], + ['_5fblock_5fbucketize_5fsparse_5ffeatures_5fcpu_6',['_block_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#adaf7cd0195ff361555f35a017c018d25',1,'fbgemm_gpu']]], + ['_5fbucketize_5fsparse_5ffeatures_5fcpu_7',['_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a1f2b214db9aa3f8887c267c0ea9f5edf',1,'fbgemm_gpu']]], + ['_5fcat_5fint_5ftensors_8',['_cat_int_tensors',['../namespacefbgemm__gpu.html#acd8fa4397185c592f5eac101b42504a6',1,'fbgemm_gpu']]], + ['_5fcat_5fint_5ftensors_5fwith_5fpadding_9',['_cat_int_tensors_with_padding',['../namespacefbgemm__gpu.html#a1376d05f5d6efb4fbdb869e391702adf',1,'fbgemm_gpu']]], + ['_5fcat_5fper_5fsample_5fweights_5flist_10',['_cat_per_sample_weights_list',['../namespacefbgemm__gpu.html#a0eec17207e4a69da15dae845d02721e5',1,'fbgemm_gpu']]], + ['_5fdispatch_5femb_5fcache_5ftypes_11',['_DISPATCH_EMB_CACHE_TYPES',['../dispatch__macros_8h.html#a8a3aad8de22734b1397d813a855528e1',1,'dispatch_macros.h']]], + ['_5fexpand_5finto_5fjagged_5fpermute_5fcpu_5fkernel_12',['_expand_into_jagged_permute_cpu_kernel',['../namespacefbgemm__gpu.html#ac339123bb72d7421fca2d2b56821f02a',1,'fbgemm_gpu']]], + ['_5ffloat_5for_5fhalf_5fto_5ffusednbitrowwise_5fgpu_13',['_float_or_half_to_fusednbitrowwise_gpu',['../group__sparse-data-cuda.html#ga3b963d0e45c2bc0060aaa974efe64b8a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fbfloat16_5fcpu_14',['_float_to_bfloat16_cpu',['../namespacefbgemm__gpu.html#a51665269174ef625316e519465a67839',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fbfloat16_5fgpu_15',['_float_to_bfloat16_gpu',['../group__quantize-ops-cuda.html#ga2f1cc4b6dc6f708324855f94d558cfc1',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffp8rowwise_5fgpu_16',['_float_to_FP8rowwise_gpu',['../group__quantize-ops-cuda.html#ga31b9029d43a60ad1fc90dc6ec54af9db',1,'fbgemm_gpu']]], + 
['_5ffloat_5fto_5ffp8rowwise_5fgpu_5ft_17',['_float_to_FP8rowwise_gpu_t',['../namespacefbgemm__gpu.html#a6c5dca8da7ca5c5f89ecdc816745ba29',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fcpu_5fout_18',['_float_to_fused8bitrowwise_cpu_out',['../group__quantize-data-cpu.html#gad38a9310258acccab8a017c1616034d0',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fcpu_5fout_5ft_19',['_float_to_fused8bitrowwise_cpu_out_t',['../namespacefbgemm__gpu.html#a7f58b5ea1ea6cd38a42f73e5d688bb2c',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fgpu_20',['_float_to_fused8bitrowwise_gpu',['../group__quantize-ops-cuda.html#ga8c11c8dc06cae57b3afba79358c00e99',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fgpu_5ft_21',['_float_to_fused8bitrowwise_gpu_t',['../namespacefbgemm__gpu.html#a16bbb8557f4229489d966bb1d11bd00c',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fcpu_22',['_float_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a29553ad77238659bb86c14842103d1d5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fgpu_23',['_float_to_fusednbitrowwise_gpu',['../group__quantize-ops-cuda.html#gaa3e8fd136e9bfa0e4d0c0016659bf708',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fgpu_5ft_24',['_float_to_fusednbitrowwise_gpu_t',['../group__quantize-ops-cuda.html#ga02c8f9158646d9b16efbd3853711f56a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fhfp8_5fcpu_25',['_float_to_hfp8_cpu',['../namespacefbgemm__gpu.html#a70e9b9692aae9789f0a3804b9d12efe5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fhfp8_5fgpu_26',['_float_to_hfp8_gpu',['../group__quantize-ops-cuda.html#gab2837424e3774fe34ba255658554a75a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fmsfp_5fgpu_27',['_float_to_msfp_gpu',['../group__quantize-ops-cuda.html#ga427f81e1d8901e2fafc9611860fbd4d5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fpaddedfp8rowwise_5fgpu_28',['_float_to_paddedFP8rowwise_gpu',['../group__quantize-ops-cuda.html#ga5043927653e4d50462b79b7f3df33223',1,'fbgemm_gpu']]], + 
['_5ffloat_5fto_5fpaddedfp8rowwise_5fgpu_5ft_29',['_float_to_paddedFP8rowwise_gpu_t',['../namespacefbgemm__gpu.html#a1d80140f030f2ca22fd14560e2d8aa42',1,'fbgemm_gpu']]], + ['_5ffp8rowwise_5fto_5ffloat_5fgpu_30',['_FP8rowwise_to_float_gpu',['../namespacefbgemm__gpu.html#a70d90c85fad4384b23c8958a6c300ce2',1,'fbgemm_gpu']]], + ['_5ffp8rowwise_5fto_5ffloat_5fgpu_5ft_31',['_FP8rowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#ac8931bd574641641dc69eadaae32efe3',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fcpu_5fout_32',['_fused8bitrowwise_to_float_cpu_out',['../group__quantize-data-cpu.html#gabeb6675833a5b14e0a0d01385770a771',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fcpu_5fout_5ft_33',['_fused8bitrowwise_to_float_cpu_out_t',['../namespacefbgemm__gpu.html#acc6b77e9be7ff8c2e5f16297fa6fad38',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fgpu_34',['_fused8bitrowwise_to_float_gpu',['../namespacefbgemm__gpu.html#aab093a380068925d1b267452a1e255c2',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fgpu_5ft_35',['_fused8bitrowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#a25d0793a9d1fe66bccad409791738b7b',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fmixed_5fdim_5fgpu_36',['_fused8bitrowwise_to_float_mixed_dim_gpu',['../group__quantize-ops-cuda.html#ga4c2c033e940095d20e76e9e00fe925d3',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5fhalf_5fgpu_37',['_fused8bitrowwise_to_half_gpu',['../namespacefbgemm__gpu.html#a3aa2e594cf4bbb5cb5241c4eaa593f8a',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5fsingle_5for_5fhalf_5fprecision_5fgpu_38',['_fused8bitrowwise_to_single_or_half_precision_gpu',['../group__quantize-ops-cuda.html#gafacdb4ec7d8f5b969c75d2127537ab16',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5fcpu_39',['_fusednbitrowwise_to_float_cpu',['../namespacefbgemm__gpu.html#aa6141e72712885a0c89d74829be2fe6a',1,'fbgemm_gpu']]], + 
['_5ffusednbitrowwise_5fto_5ffloat_5fgpu_40',['_fusednbitrowwise_to_float_gpu',['../namespacefbgemm__gpu.html#ae0193dd7bbb4e72fc977330cc3f019a4',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5fgpu_5ft_41',['_fusednbitrowwise_to_float_gpu_t',['../group__quantize-ops-cuda.html#gae1e827b74f0825dc4135e68c10e443b3',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5for_5fhalf_5fgpu_42',['_fusednbitrowwise_to_float_or_half_gpu',['../group__quantize-ops-cuda.html#ga07f4c02c95710472b815bdc1d7bfff19',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5fhalf_5fgpu_43',['_fusednbitrowwise_to_half_gpu',['../group__quantize-ops-cuda.html#ga6152517943258bd3adc42b7c103a9277',1,'fbgemm_gpu']]], + ['_5fgeneric_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_5fkernel_44',['_generic_histogram_binning_calibration_by_feature_cpu_kernel',['../namespacefbgemm__gpu.html#accd75a24d809f4322a18bfb12f47b343',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffused8bitrowwise_5fcpu_5fout_45',['_half_to_fused8bitrowwise_cpu_out',['../namespacefbgemm__gpu.html#a23bfcbc4afa5dd7d35ee03b7f23840a9',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffused8bitrowwise_5fgpu_46',['_half_to_fused8bitrowwise_gpu',['../namespacefbgemm__gpu.html#adfeb2fc956b7aa5c2446a00ccbcd058e',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffusednbitrowwise_5fgpu_47',['_half_to_fusednbitrowwise_gpu',['../group__quantize-ops-cuda.html#ga6e2bd64f3f9e3b36493ec955680771af',1,'fbgemm_gpu']]], + ['_5fhfp8_5fto_5ffloat_5fcpu_48',['_hfp8_to_float_cpu',['../namespacefbgemm__gpu.html#aaa8438f606e84d5cb07827759163bec6',1,'fbgemm_gpu']]], + ['_5fhfp8_5fto_5ffloat_5fgpu_49',['_hfp8_to_float_gpu',['../group__quantize-ops-cuda.html#ga03a8f8825a16c6235b699886fa46e1f6',1,'fbgemm_gpu']]], + ['_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_5fkernel_50',['_histogram_binning_calibration_by_feature_cpu_kernel',['../namespacefbgemm__gpu.html#adce89aa38a4a22058ec42b5077bbe23a',1,'fbgemm_gpu']]], + 
['_5fhistogram_5fbinning_5fcalibration_5fcpu_5fkernel_51',['_histogram_binning_calibration_cpu_kernel',['../namespacefbgemm__gpu.html#a7639f61a587aa5052c488fbd00d3784b',1,'fbgemm_gpu']]], + ['_5finvert_5fpermute_5fcpu_5fkernel_52',['_invert_permute_cpu_kernel',['../namespacefbgemm__gpu.html#a7a8e9e91365de25b995833c08eb32eff',1,'fbgemm_gpu']]], + ['_5fmsfp_5fto_5ffloat_5fgpu_53',['_msfp_to_float_gpu',['../group__quantize-ops-cuda.html#gac0c20377454dbfafcc5ac245fe6427ce',1,'fbgemm_gpu']]], + ['_5fpaddedfp8rowwise_5fto_5ffloat_5fgpu_54',['_paddedFP8rowwise_to_float_gpu',['../namespacefbgemm__gpu.html#afc30bb56977528d8a85e43f9aa5c2cf8',1,'fbgemm_gpu']]], + ['_5fpaddedfp8rowwise_5fto_5ffloat_5fgpu_5ft_55',['_paddedFP8rowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#a0c0b93e239757d9564c51f8922f17554',1,'fbgemm_gpu']]], + ['_5fpermute_5f1d_5findices_5fweights_5fkernel_5fcpu_56',['_permute_1D_indices_weights_kernel_cpu',['../namespacefbgemm__gpu.html#af0e07ade6f2b89bf71c344aac8106b59',1,'fbgemm_gpu']]], + ['_5fpermute_5f1d_5flengths_5fcpu_5fkernel_57',['_permute_1D_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a8dfcdb2c902cf1c4e5d0ed916d5fe779',1,'fbgemm_gpu']]], + ['_5fpermute_5f2d_5findices_5fweights_5fkernel_5fcpu_58',['_permute_2D_indices_weights_kernel_cpu',['../namespacefbgemm__gpu.html#acad68edeefe7a7710f729cdc56876851',1,'fbgemm_gpu']]], + ['_5fpermute_5f2d_5flengths_5fcpu_5fkernel_59',['_permute_2D_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a72c447e3b6d38b548d89ebc464e2d469',1,'fbgemm_gpu']]], + ['_5fpermute_5fdata_5fkernel_5fcpu_60',['_permute_data_kernel_cpu',['../namespacefbgemm__gpu.html#a2fb715b347e075f3331083905cdaadfb',1,'fbgemm_gpu']]], + ['_5fpermute_5fembeddings_5fkernel_5fcpu_61',['_permute_embeddings_kernel_cpu',['../namespacefbgemm__gpu.html#a6987e1403a25c256168873616dffbdf6',1,'fbgemm_gpu']]], + 
['_5fpermute_5flengths_5fcpu_5fkernel_62',['_permute_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a4c7749afd2c661b1d302268035fde42b',1,'fbgemm_gpu']]], + ['_5fsegment_5fsum_5fcsr_5fcpu_5fkernel_63',['_segment_sum_csr_cpu_kernel',['../namespacefbgemm__gpu.html#ade08c8b174b0ecbb99d01ad87b4da0b3',1,'fbgemm_gpu']]], + ['_5fsingle_5for_5fhalf_5fprecision_5fto_5ffused8bitrowwise_5fgpu_64',['_single_or_half_precision_to_fused8bitrowwise_gpu',['../group__quantize-ops-cuda.html#gaff285349cb9c51a56fc418b628772b16',1,'fbgemm_gpu']]], + ['_5fupdate_5fkernel_65',['_update_kernel',['../embedding__optimizer__split__kernel__template_8cu.html#afab484072b9b8381500b14e31ba49364',1,'_update_kernel(at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights, at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights, at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const int32_t max_D, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, {{ args.split_kernel_args|join(", ") }}): embedding_optimizer_split_kernel_template.cu'],['../embedding__optimizer__split__template_8cu.html#afab484072b9b8381500b14e31ba49364',1,'_update_kernel(at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights, at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights, at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights, const 
at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const int32_t max_D, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, {{ args.split_kernel_args|join(", ") }}): embedding_optimizer_split_kernel_template.cu']]], + ['_5fv2_5fkernel_66',['_v2_kernel',['../embedding__forward__split__kernel__v2__template_8cu.html#a20b736346ad19821ed9748c4dde5b058',1,'embedding_forward_split_kernel_v2_template.cu']]] ]; diff --git a/search/all_1.js b/search/all_1.js index e01359b9b..8c39e37f0 100644 --- a/search/all_1.js +++ b/search/all_1.js @@ -1,11 +1,39 @@ var searchData= [ - ['combine_20input_20operators_0',['Combine Input Operators',['../group__input-combine.html',1,'']]], - ['comparator_1',['Comparator',['../structfbgemm__gpu_1_1_comparator.html',1,'fbgemm_gpu']]], - ['cpu_20operators_2',['cpu operators',['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], - ['cpu_20permutation_20operators_3',['CPU Permutation Operators',['../group__permute-pooled-embs-cpu.html',1,'']]], - ['cuda_4',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]], - ['cuda_20memorty_20operators_5',['CUDA Memorty Operators',['../group__cumem-utils.html',1,'']]], - ['cuda_20operators_6',['cuda operators',['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA 
Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], - ['cuda_20permutation_20operators_7',['CUDA Permutation Operators',['../group__permute-pooled-embs-gpu.html',1,'']]] + ['a_0',['a',['../structfbgemm__gpu_1_1_half4.html#a27075551b75deec4b6f30d368075d852',1,'fbgemm_gpu::Half4::a'],['../structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html#a943da41846f7804fa8edd8b012551545',1,'fbgemm_gpu::StochasticRoundingRNGState::a']]], + ['acc_1',['acc',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#af700a6ab2d3b94e74f6a387b1adebdc6',1,'fbgemm_gpu::Vec4T< float >::acc'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#af700a6ab2d3b94e74f6a387b1adebdc6',1,'fbgemm_gpu::Vec4T< at::Half >::acc'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#af700a6ab2d3b94e74f6a387b1adebdc6',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::acc'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#accb1990c79cc9a9c3ca84d635d589ca4',1,'fbgemm_gpu::Vec4T< double >::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ae7a59aea3ae02e7c3c40b93e77208b3a',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#af835160660d81c33fb2f1f42017452fb',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#af700a6ab2d3b94e74f6a387b1adebdc6',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#af700a6ab2d3b94e74f6a387b1adebdc6',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a17543b514b8298a1e94b5671db506366',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT 
>::acc'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a1f332e6824c0bf94b367c027c6c91595',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::acc'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a019a15988d03cdc6474def4b35e32345',1,'fbgemm_gpu::Vec4AccT::acc']]], + ['acc_5fadd_5for_5ffma_2',['ACC_ADD_OR_FMA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: embedding_forward_split_kernel_v2_template.cu']]], + ['accumulate_5ffp16_3',['accumulate_fp16',['../namespacefbgemm__gpu.html#a3de0ed0985acc3edc0583b6cd56a43f2',1,'fbgemm_gpu']]], + ['accumulate_5ffp32_4',['accumulate_fp32',['../namespacefbgemm__gpu.html#aeb3ef6437b744f52b29910361f83336c',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fhfp8_5',['accumulate_packed_hfp8',['../namespacefbgemm__gpu.html#acc596fdaac7efc925d19d7374251e8cb',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint2_6',['accumulate_packed_int2',['../namespacefbgemm__gpu.html#a857c58d8bfc412a3901414ef0b0f73c5',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint4_7',['accumulate_packed_int4',['../namespacefbgemm__gpu.html#af3478ab6f636e80a75953ffc1d8caed9',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint8_8',['accumulate_packed_int8',['../namespacefbgemm__gpu.html#a24c22ef27a441cb888d3b32957588794',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5ffp16_9',['accumulate_weighted_fp16',['../namespacefbgemm__gpu.html#a2700bcf99c82f2491a174d51c462e4e8',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5ffp32_10',['accumulate_weighted_fp32',['../namespacefbgemm__gpu.html#a7225f36d3ef25f69273160500bd0b9a7',1,'fbgemm_gpu']]], + 
['accumulate_5fweighted_5fpacked_5fhfp8_11',['accumulate_weighted_packed_hfp8',['../namespacefbgemm__gpu.html#aa177a98d987438afcde04f7fc2cba71a',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint2_12',['accumulate_weighted_packed_int2',['../namespacefbgemm__gpu.html#aebe17b37f24d82ea8cfbd296e307d5ab',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint4_13',['accumulate_weighted_packed_int4',['../namespacefbgemm__gpu.html#ade03f1b4099c9ecaf38d7d6a0eb7d595',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint8_14',['accumulate_weighted_packed_int8',['../namespacefbgemm__gpu.html#a80d2d456b1c87f68c9098d5e5d1fd47d',1,'fbgemm_gpu']]], + ['add_15',['add',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a3421b900475f40701fb4c0c1c542744c',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a5686a6ec8884ddf2ad633d735d181011',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ac26f750f3fa72d8b137026cc8726972f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::add()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a827812cf195008164049b47d4fc9efc1',1,'fbgemm_gpu::Vec4AccT::add(const float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a533e0b7fe298fd776f58607d9f67bda1',1,'fbgemm_gpu::Vec4AccT::add(const float2 
*ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a14f0714a4e51293efb99e3d6815be3a2',1,'fbgemm_gpu::Vec4AccT::add(const uint8_t *ptr)']]], + ['add_5f_16',['add_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< float >::add_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< at::Half >::add_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a3f8a7e8e00c59205f3b32b345290922b',1,'fbgemm_gpu::Vec4T< at::Half >::add_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::add_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a3f8a7e8e00c59205f3b32b345290922b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::add_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ae0cdda7691531bfb7975dad742ff3984',1,'fbgemm_gpu::Vec4T< double >::add_()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a746ed2bbabd0878f33b478c587bde0cf',1,'fbgemm_gpu::Vec4AccT::add_(const float *vals)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a5eebdd38332484343d4400fd08f3b549',1,'fbgemm_gpu::Vec4AccT::add_(const half2 *vals_h)']]], + ['adjacencymatrix_17',['AdjacencyMatrix',['../topology__utils_8h.html#ada7183ec06808ddb73d8f1a65cd8f7ae',1,'topology_utils.h']]], + ['adjust_5finfo_5fb_5fnum_5fbits_18',['adjust_info_B_num_bits',['../split__embeddings__utils_8cuh.html#aaaa05e63829893f17b951de7dc993747',1,'adjust_info_B_num_bits(int32_t B, int32_t T): get_infos_metadata.cu'],['../get__infos__metadata_8cu.html#a315ee6fa620a68c902298d741ac8989d',1,'adjust_info_B_num_bits(int32_t B, int32_t T): get_infos_metadata.cu']]], + 
['adjust_5foffset_5fkernel_19',['adjust_offset_kernel',['../embedding__bounds__check_8cu.html#af9e26c2f2d6dfef45e1a12507d8c2b72',1,'embedding_bounds_check.cu']]], + ['all_5fto_5fone_5fdevice_20',['all_to_one_device',['../group__merge-pooled-emb.html#ga3933c7465129b58edd60ffcc1999c223',1,'fbgemm_gpu']]], + ['architecture_5fid_21',['ARCHITECTURE_ID',['../_c_make_c_compiler_id_8c.html#aba35d0d200deaeb06aee95ca297acb28',1,'ARCHITECTURE_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#aba35d0d200deaeb06aee95ca297acb28',1,'ARCHITECTURE_ID: CMakeCXXCompilerId.cpp']]], + ['args_5fpos_22',['args_pos',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396',1,'fbgemm_gpu']]], + ['assign_23',['assign',['../namespacefbgemm__gpu.html#a6e69d027d43eb7e92ea620d43ae43cb1',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_24',['asynchronous_complete_cumsum',['../transpose__embedding__input_8cu.html#ae27e2b1fda2a338ce8f7f2207b580e7f',1,'transpose_embedding_input.cu']]], + ['asynchronous_5fcomplete_5fcumsum_5fcpu_25',['asynchronous_complete_cumsum_cpu',['../namespacefbgemm__gpu.html#a98effac974dc3fe5bbcc4ce8a75578f7',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_5fgpu_26',['asynchronous_complete_cumsum_gpu',['../namespacefbgemm__gpu.html#a1f31ee9922c98ad5d013361368f2f5ac',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_5fmeta_27',['asynchronous_complete_cumsum_meta',['../namespacefbgemm__gpu.html#a656bb5222f2a0bc92d5b895ba0fa846c',1,'fbgemm_gpu']]], + ['asynchronous_5fexclusive_5fcumsum_5fcpu_28',['asynchronous_exclusive_cumsum_cpu',['../namespacefbgemm__gpu.html#a69fe5be794026bdb73b0196be9b345a4',1,'fbgemm_gpu']]], + ['asynchronous_5fexclusive_5fcumsum_5fgpu_29',['asynchronous_exclusive_cumsum_gpu',['../namespacefbgemm__gpu.html#afd8b0919b5b3b021a8eb3727e304d5b4',1,'fbgemm_gpu']]], + 
['asynchronous_5fexclusive_5fcumsum_5fmeta_30',['asynchronous_exclusive_cumsum_meta',['../namespacefbgemm__gpu.html#ae96f1ffdb8ed1efd58561364fbaf3c6a',1,'fbgemm_gpu']]], + ['asynchronous_5finclusive_5fcumsum_5fcpu_31',['asynchronous_inclusive_cumsum_cpu',['../namespacefbgemm__gpu.html#a8930419ab36c85750182c12db95baa29',1,'fbgemm_gpu']]], + ['asynchronous_5finclusive_5fcumsum_5fgpu_32',['asynchronous_inclusive_cumsum_gpu',['../namespacefbgemm__gpu.html#acc0c0e7f6e816900474b2e52756ac891',1,'fbgemm_gpu']]], + ['at_33',['at',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a95ed732ddbdd788721e2c0fc17a3d8a0',1,'fbgemm_gpu::TensorAccessorBase::at()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a95ed732ddbdd788721e2c0fc17a3d8a0',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::at()']]], + ['at_5fx_34',['AT_X',['../fbgemm__tensor__accessor_8h.html#ac7d28de6473a715c6228c08b391476bb',1,'fbgemm_tensor_accessor.h']]], + ['auc_5fkernel_35',['auc_kernel',['../namespacefbgemm__gpu.html#a4bcadae3f465ece7979bf89f0c1cf22a',1,'fbgemm_gpu']]] ]; diff --git a/search/all_10.js b/search/all_10.js index 7146073cb..201111c89 100644 --- a/search/all_10.js +++ b/search/all_10.js @@ -1,7 +1,120 @@ var searchData= [ - ['tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], - ['tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], - ['transformation_20cpu_20operators_2',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], - ['transformation_20cuda_20operators_3',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]] + ['p_5findex_5fweights_0',['P_index_weights',['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54ae6fddad64ad96f09ab2bf8e417dcab18',1,'gen_embedding_forward_split_weighted_v2_kernel.cu']]], + 
['p_5findices_1',['P_indices',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5findices_5fis_5flong_2',['P_indices_is_long',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ac640586328f5125ff8881c6b93fac125',1,'fbgemm_gpu']]], + ['p_5findices_5foffsets_3',['P_indices_offsets',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a5f3a87c5dbebfaefd128c19ebbe6c7de',1,'fbgemm_gpu']]], + ['p_5findices_5fprts_4',['P_indices_prts',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a8ae3847f58b98ba0ff4b0fcdfb4ae8e6',1,'fbgemm_gpu']]], + ['p_5flengths_5faddrs_5',['P_lengths_addrs',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a66aa4e0ec73344232b5d56ee78ef17b0',1,'fbgemm_gpu']]], + ['p_5flengths_5fis_5flong_6',['P_lengths_is_long',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a1c841401de519f97ca671d064c22250e',1,'fbgemm_gpu']]], + ['p_5flengths_5foffsets_7',['P_lengths_offsets',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ad300b64361a3f3e756bfa78fd0b23b97',1,'fbgemm_gpu']]], + ['p_5fload_5fd_8',['P_load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5flxu_5fcache_5flocations_9',['P_lxu_cache_locations',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071da9e6d36a61249ee13ac61fee16a76d83c',1,'P_lxu_cache_locations: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071da9e6d36a61249ee13ac61fee16a76d83c',1,'P_lxu_cache_locations: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5flxu_5fcache_5fweights_10',['P_lxu_cache_weights',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071daf09c8e1f82af5f3e97070537dec964e0',1,'P_lxu_cache_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071daf09c8e1f82af5f3e97070537dec964e0',1,'P_lxu_cache_weights: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5fnum_5foffsets_11',['P_num_offsets',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5foffsets_12',['P_offsets',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a7fcce188570ec66dece71f0da186e029',1,'P_offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a7fcce188570ec66dece71f0da186e029',1,'P_offsets: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5foutputs_13',['P_outputs',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5fper_5fsample_5fweight_14',['P_per_sample_weight',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ae38edd0733e3ec3ca85cfa8bd9b8ac93',1,'fbgemm_gpu']]], + ['p_5ftotal_5fload_5fd_15',['P_total_load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5fweights_16',['P_weights',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: embedding_forward_split_kernel_v2_template.cu']]], + ['pack_5fsegments_5fautograd_17',['pack_segments_autograd',['../namespacefbgemm__gpu.html#a24fd2f4efa543ea716010c3fc1832587',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fbackward_5fcpu_18',['pack_segments_backward_cpu',['../namespacefbgemm__gpu.html#a51f0921a8e934c6c4d0fca5ebb5d8338',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fbackward_5fcuda_19',['pack_segments_backward_cuda',['../namespacefbgemm__gpu.html#aaded8e25bef3a32580d71dc2ead25f0c',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcpu_20',['pack_segments_cpu',['../namespacefbgemm__gpu.html#a01151883c1840f280f4f9c083677c8b5',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcuda_21',['pack_segments_cuda',['../namespacefbgemm__gpu.html#a049c248a78797b27f5e053809c13b88e',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcuda_5fkernel_22',['pack_segments_cuda_kernel',['../namespacefbgemm__gpu.html#a3ff1eed5a38a10b4da916f9ec154f225',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fforward_5fcpu_23',['pack_segments_forward_cpu',['../namespacefbgemm__gpu.html#a49cb5dd543cc63e932f458e1c79c0d00',1,'fbgemm_gpu']]], + 
['pack_5fsegments_5fforward_5fcuda_24',['pack_segments_forward_cuda',['../namespacefbgemm__gpu.html#a4bec138cb5be2583288d026eb4185646',1,'fbgemm_gpu']]], + ['packedtensoraccessor32_25',['PackedTensorAccessor32',['../namespacefbgemm__gpu.html#a64ee5a7e6df3a95f1d4bdd9f38707c96',1,'fbgemm_gpu']]], + ['packedtensoraccessor64_26',['PackedTensorAccessor64',['../namespacefbgemm__gpu.html#a69b304f75455a9eb7144259c09770877',1,'fbgemm_gpu']]], + ['padded_5fd_27',['padded_D',['../namespacenbit.html#a45a36e2eb0376c3e37728ea312851cd7',1,'nbit']]], + ['padded_5frow_5fsize_5fin_5fbytes_28',['padded_row_size_in_bytes',['../namespacenbit.html#a3ac5bf25115544f9067032bef644a215',1,'nbit']]], + ['padding_5ffused_5ftbe_5finput_5fcombine_5fcpu_29',['padding_fused_tbe_input_combine_cpu',['../group__input-combine.html#ga9ab60fbe75053c2f31f7d3f16dfa476f',1,'fbgemm_gpu']]], + ['padding_5ffused_5ftbe_5finput_5fcombine_5fwith_5flength_5fcpu_30',['padding_fused_tbe_input_combine_with_length_cpu',['../namespacefbgemm__gpu.html#af01b4023830652f0cc3e99c87f7b4526',1,'fbgemm_gpu']]], + ['params_5foffset_31',['params_offset',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a02bec57c3d9431edc5aba7767412fada',1,'params_offset: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a02bec57c3d9431edc5aba7767412fada',1,'params_offset: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['per_5fsample_5fweights_5faddrs_32',['per_sample_weights_addrs',['../namespacefbgemm__gpu.html#a34e6956031d1fc5c0f8df5fb432bcfbd',1,'fbgemm_gpu']]], + ['permute_33',['permute',['../namespacefbgemm__gpu.html#a313d400789ec7e8bf0702c1d06339394',1,'fbgemm_gpu']]], + ['permute_20pooled_20embeddings_20operators_20cpu_34',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['permute_20pooled_20embeddings_20operators_20cuda_35',['Permute Pooled Embeddings Operators 
(CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]], + ['permute102_5fbaddbmm_5fpermute102_5fcpu_36',['permute102_baddbmm_permute102_cpu',['../namespacefbgemm__gpu.html#ab8d862f0ffee51a4d276f3989f0ab24b',1,'fbgemm_gpu']]], + ['permute102_5fbaddbmm_5fpermute102_5fcuda_37',['permute102_baddbmm_permute102_cuda',['../namespacefbgemm__gpu.html#a0c3f53164eb98c0b45b5aaef3e99a172',1,'fbgemm_gpu']]], + ['permute_5f1d_5fsparse_5fdata_5fcpu_38',['permute_1D_sparse_data_cpu',['../namespacefbgemm__gpu.html#a22758d46158e49801e876ab269855736',1,'fbgemm_gpu']]], + ['permute_5f2d_5fsparse_5fdata_5fcpu_39',['permute_2D_sparse_data_cpu',['../namespacefbgemm__gpu.html#a83da584464d49a223941e4b926b9676a',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fcpu_40',['permute_duplicate_pooled_embs_auto_grad_cpu',['../namespacefbgemm__gpu.html#aeabdb24bef8b30a2b80b94a676b2b5fb',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fgpu_41',['permute_duplicate_pooled_embs_auto_grad_gpu',['../namespacefbgemm__gpu.html#a242a088c94da1f0b016087bef8460622',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fcpu_42',['permute_duplicate_pooled_embs_auto_grad_split_cpu',['../namespacefbgemm__gpu.html#af0cdb20f76a1c62644ad644e4c7210ad',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fgpu_43',['permute_duplicate_pooled_embs_auto_grad_split_gpu',['../namespacefbgemm__gpu.html#a276c76fa5487668edb8477a844ca1704',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fcpu_44',['permute_duplicate_pooled_embs_cpu',['../namespacefbgemm__gpu.html#acc5af8d2639bda183a7758a7fb4d4e9a',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fgpu_45',['permute_duplicate_pooled_embs_gpu',['../namespacefbgemm__gpu.html#aecf7e9c2b36bb349c98294b9abfcf7c1',1,'fbgemm_gpu']]], + 
['permute_5fduplicate_5fpooled_5fembs_5fsplit_5fcpu_46',['permute_duplicate_pooled_embs_split_cpu',['../namespacefbgemm__gpu.html#a286571e933b530189672faaa53ee20e6',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fsplit_5fgpu_47',['permute_duplicate_pooled_embs_split_gpu',['../namespacefbgemm__gpu.html#a34e792da7d58bd96fc1c9d4c0b1b3a2a',1,'fbgemm_gpu']]], + ['permute_5fembeddings_5fkernel_48',['permute_embeddings_kernel',['../namespacefbgemm__gpu.html#a2b00efff9050b6bec363081afc5c3c2f',1,'fbgemm_gpu']]], + ['permute_5foutput_5fdim_5f0_5f1_49',['permute_output_dim_0_1',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a5bd1879ce15d52289f55eb10253c8e8e',1,'gen_batch_index_select_dim0_forward_kernel_small.cu']]], + ['permute_5fpooled_5fembedding_5ffunction_2ecpp_50',['permute_pooled_embedding_function.cpp',['../permute__pooled__embedding__function_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_2ecu_51',['permute_pooled_embedding_ops.cu',['../permute__pooled__embedding__ops_8cu.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_2eh_52',['permute_pooled_embedding_ops.h',['../permute__pooled__embedding__ops_8h.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fcpu_2ecpp_53',['permute_pooled_embedding_ops_cpu.cpp',['../permute__pooled__embedding__ops__cpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fgpu_2ecpp_54',['permute_pooled_embedding_ops_gpu.cpp',['../permute__pooled__embedding__ops__gpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_2ecu_55',['permute_pooled_embedding_ops_split.cu',['../permute__pooled__embedding__ops__split_8cu.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_2eh_56',['permute_pooled_embedding_ops_split.h',['../permute__pooled__embedding__ops__split_8h.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_5fcpu_2ecpp_57',['permute_pooled_embedding_ops_split_cpu.cpp',['../permute__pooled__embedding__ops__split__cpu_8cpp.html',1,'']]], + 
['permute_5fpooled_5fembedding_5fops_5fsplit_5fgpu_2ecpp_58',['permute_pooled_embedding_ops_split_gpu.cpp',['../permute__pooled__embedding__ops__split__gpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_59',['permute_pooled_embs_auto_grad',['../group__permute-pooled-embs-cpu.html#ga3fd0766d863a18ea5cce4bfdef6a0349',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fcpu_60',['permute_pooled_embs_auto_grad_cpu',['../group__permute-pooled-embs-cpu.html#gac050c22198470709b89b4d5b160006b0',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fgpu_61',['permute_pooled_embs_auto_grad_gpu',['../group__permute-pooled-embs-gpu.html#gad0d8a6f85fc81bc54e4c20e60fe6eb11',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fmeta_62',['permute_pooled_embs_auto_grad_meta',['../namespacefbgemm__gpu.html#a4381e6e500aad1cf049aa509fc17b16b',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fcpu_63',['permute_pooled_embs_auto_grad_split_cpu',['../group__permute-pooled-embs-cpu.html#ga62bb71eb3e7a980ce5efded317717189',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fgpu_64',['permute_pooled_embs_auto_grad_split_gpu',['../group__permute-pooled-embs-gpu.html#gab5673b48b58896e4954cc8fc7c90c4d8',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fcpu_65',['permute_pooled_embs_cpu',['../namespacefbgemm__gpu.html#aa321302401045119810e93f42a361f1f',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fcpu_5fimpl_66',['permute_pooled_embs_cpu_impl',['../group__permute-pooled-embs-cpu.html#ga39797562608b1226fc1632f815f7d8a2',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5ffunction_2eh_67',['permute_pooled_embs_function.h',['../permute__pooled__embs__function_8h.html',1,'']]], + ['permute_5fpooled_5fembs_5ffunction_5fsplit_2eh_68',['permute_pooled_embs_function_split.h',['../permute__pooled__embs__function__split_8h.html',1,'']]], + 
['permute_5fpooled_5fembs_5fgpu_69',['permute_pooled_embs_gpu',['../namespacefbgemm__gpu.html#a9b4a18abd526ab3e9c95f782d87afbbb',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fgpu_5fimpl_70',['permute_pooled_embs_gpu_impl',['../namespacefbgemm__gpu.html#aca0e73083114d9eea99129e54b89fa23',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fkernel_71',['permute_pooled_embs_kernel',['../layout__transform__ops_8cuh.html#acf1671783450ed8e673d22cbc1d917b5',1,'layout_transform_ops.cuh']]], + ['permute_5fpooled_5fembs_5fmeta_72',['permute_pooled_embs_meta',['../namespacefbgemm__gpu.html#a1183d2ce4456d290df04c32b215fc22e',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fcpu_73',['permute_pooled_embs_split_cpu',['../group__permute-pooled-embs-cpu.html#ga21fd23f8f0de62159529356ebf7eb1f1',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fcpu_5fimpl_74',['permute_pooled_embs_split_cpu_impl',['../namespacefbgemm__gpu.html#a9ce974f08ff3cb46289f39af5ea7fcec',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fgpu_75',['permute_pooled_embs_split_gpu',['../group__permute-pooled-embs-gpu.html#ga342967f8cc4e25c7655d1987536cdc6b',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fgpu_5fimpl_76',['permute_pooled_embs_split_gpu_impl',['../namespacefbgemm__gpu.html#a0d587655a374b11bb6b7febcabe0f403',1,'fbgemm_gpu']]], + ['permute_5fsequence_5fembeddings_5fcpu_77',['permute_sequence_embeddings_cpu',['../namespacefbgemm__gpu.html#a6c601604b9a15b45176ad42d4ca04d7d',1,'fbgemm_gpu']]], + ['permute_5fsequence_5fembeddings_5fcuda_78',['permute_sequence_embeddings_cuda',['../namespacefbgemm__gpu.html#a713a7245a4295a57007802212dca05ee',1,'fbgemm_gpu']]], + ['permute_5fsparse_5ffeatures_5fcpu_79',['permute_sparse_features_cpu',['../namespacefbgemm__gpu.html#a7eec8c74f87d4204857061b761a17ede',1,'fbgemm_gpu']]], + ['permuted_5findices_80',['permuted_indices',['../namespacefbgemm__gpu.html#ab448dead4746a419f7d4a69a32c788ea',1,'fbgemm_gpu']]], + 
['permuted_5flengths_5fsize_81',['permuted_lengths_size',['../namespacefbgemm__gpu.html#a77fcd99017c7bb6155d154951f8f45bc',1,'fbgemm_gpu']]], + ['permuted_5fweights_82',['permuted_weights',['../namespacefbgemm__gpu.html#a3035a61c641ca380da28b01558f5fdaa',1,'fbgemm_gpu']]], + ['permutepooledembsfunction_83',['PermutePooledEmbsFunction',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html',1,'fbgemm_gpu']]], + ['permutepooledembsfunctionsplit_84',['PermutePooledEmbsFunctionSplit',['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html',1,'fbgemm_gpu']]], + ['placementtype_85',['PlacementType',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194',1,'fbgemm_gpu']]], + ['platform_5fid_86',['PLATFORM_ID',['../_c_make_c_compiler_id_8c.html#adbc5372f40838899018fadbc89bd588b',1,'PLATFORM_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#adbc5372f40838899018fadbc89bd588b',1,'PLATFORM_ID: CMakeCXXCompilerId.cpp']]], + ['pooled_20embeddings_20operators_20cpu_87',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['pooled_20embeddings_20operators_20cuda_88',['Permute Pooled Embeddings Operators (CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]], + ['poolingmode_89',['PoolingMode',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5d',1,'fbgemm_gpu']]], + ['pre_5fsigmoid_90',['pre_sigmoid',['../namespacefbgemm__gpu.html#a63c15a2ca68e0a1638710ac9d5335e6a',1,'fbgemm_gpu']]], + ['prefix_5fsum_91',['prefix_sum',['../namespacefbgemm__gpu.html#a82c664395e6340a5878c867fcf278bfc',1,'fbgemm_gpu']]], + ['primitivetype_92',['PrimitiveType',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60',1,'fbgemm_gpu']]], + ['private_5fcase_5ftype_5fcache_93',['PRIVATE_CASE_TYPE_CACHE',['../dispatch__macros_8h.html#ab66dce26ee489c79f3a0441be14902fa',1,'dispatch_macros.h']]], + 
['private_5fcase_5ftype_5fcache_5femb_94',['PRIVATE_CASE_TYPE_CACHE_EMB',['../dispatch__macros_8h.html#a98d43954b688bc60b943227d761487b3',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5femb_95',['PRIVATE_CASE_TYPE_EMB',['../dispatch__macros_8h.html#af2c9e16b5345c0cdb6611357e0ec15db',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5foutput_96',['PRIVATE_CASE_TYPE_OUTPUT',['../dispatch__macros_8h.html#a3905d2ceab136e10c35a2ff4fe29a7d0',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5foutput2_97',['PRIVATE_CASE_TYPE_OUTPUT2',['../dispatch__macros_8h.html#a17577aa7f884011133210418a790641a',1,'dispatch_macros.h']]], + ['process_5fall_5findices_5flarge_5fls_98',['process_all_indices_large_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): embedding_forward_split_kernel_v2_template.cu']]], + ['process_5fall_5findices_5fno_5fpooling_99',['process_all_indices_no_pooling',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): embedding_forward_split_kernel_v2_template.cu']]], + ['process_5fall_5findices_5fsmall_5fls_100',['process_all_indices_small_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): embedding_forward_split_kernel_v2_template.cu']]], + ['producer_5f_101',['producer_',['../classssd_1_1_initializer.html#a94a9376947a96732a7b6de4ca94e7fdd',1,'ssd::Initializer']]], + ['producer_5fqueue_5f_102',['producer_queue_',['../classssd_1_1_initializer.html#a04da45f241a7f5da5ebb52930ed756bc',1,'ssd::Initializer']]], + 
['pruned_5farray_5flookup_5fcpu_103',['pruned_array_lookup_cpu',['../group__embedding-cpu.html#ga50d9da3c5bc1fe8b9cabfbda212c2ea5',1,'pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga50d9da3c5bc1fe8b9cabfbda212c2ea5',1,'pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['pruned_5farray_5flookup_5fcuda_104',['pruned_array_lookup_cuda',['../group__embedding-cuda.html#gaea1d3ae26d1e893ccf08f8b55b3d6eff',1,'pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): embedding_forward_quantized_split_lookup.cu'],['../group__embedding-cuda.html#gaea1d3ae26d1e893ccf08f8b55b3d6eff',1,'pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): embedding_forward_quantized_split_lookup.cu']]], + ['pruned_5farray_5flookup_5ffrom_5frow_5fidx_5fcpu_105',['pruned_array_lookup_from_row_idx_cpu',['../namespacefbgemm__gpu.html#ab57019812325465b62248776bb200885',1,'fbgemm_gpu']]], + ['pruned_5farray_5flookup_5ffrom_5frow_5fidx_5fcuda_106',['pruned_array_lookup_from_row_idx_cuda',['../namespacefbgemm__gpu.html#adda552b8784184a2f17aa997e10869f9',1,'fbgemm_gpu']]], + ['pruned_5fhash_5ffunction_107',['pruned_hash_function',['../namespacenbit.html#adf6ceb44691d377239880812db632ef7',1,'nbit']]], + ['pruned_5fhashmap_5finsert_5funweighted_5fcpu_108',['pruned_hashmap_insert_unweighted_cpu',['../group__embedding-cpu.html#ga5b5d3d94a399c14899a4410d1f5e7dad',1,'pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): 
gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga5b5d3d94a399c14899a4410d1f5e7dad',1,'pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['pruned_5fhashmap_5finsert_5fweighted_5fcpu_109',['pruned_hashmap_insert_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#a446403a1c26f7fecbc1c67fd9be87bf0',1,'gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]], + ['pruned_5fhashmap_5flookup_5fcuda_110',['pruned_hashmap_lookup_cuda',['../group__embedding-cuda.html#ga1adb0a98306b7d6f839b5fbcaaa44ec7',1,'pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): embedding_forward_quantized_split_lookup.cu'],['../group__embedding-cuda.html#ga1adb0a98306b7d6f839b5fbcaaa44ec7',1,'pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): embedding_forward_quantized_split_lookup.cu']]], + ['pruned_5fhashmap_5flookup_5funweighted_5fcpu_111',['pruned_hashmap_lookup_unweighted_cpu',['../group__embedding-cpu.html#ga2c64467f516cc9caf72cb94e9913b211',1,'pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga2c64467f516cc9caf72cb94e9913b211',1,'pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['pruned_5fhashmap_5flookup_5fweighted_5fcpu_112',['pruned_hashmap_lookup_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#ae0d1d716d565d7e70bd253dcd89d7f47',1,'gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]], + 
['pt2_5fcompliant_5ftag_113',['PT2_COMPLIANT_TAG',['../dispatch__macros_8h.html#a3b8ceecef1ba0067d90eea1764298cda',1,'dispatch_macros.h']]], + ['ptr_5f_114',['ptr_',['../memory__utils_8cu.html#afbe2be78a3ee81b2e3c6821cec74e116',1,'memory_utils.cu']]], + ['ptr_5fname_5f_115',['ptr_name_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a504eb62b720c68145e6377f6b3eaac16',1,'fbgemm_gpu::TensorAccessorBase::ptr_name_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a7023a589c692642eb10fc0c64501a097',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::ptr_name_']]], + ['ptrtype_116',['PtrType',['../structfbgemm__gpu_1_1_default_ptr_traits.html#a931c4685c69254a5749f79cdb56ec814',1,'fbgemm_gpu::DefaultPtrTraits::PtrType'],['../classfbgemm__gpu_1_1_tensor_accessor_base.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::TensorAccessorBase::PtrType'],['../classfbgemm__gpu_1_1_tensor_accessor.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::TensorAccessor::PtrType'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::PtrType'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::PtrType'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::GenericPackedTensorAccessor::PtrType'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#ade0d5b5196750e3a6fd1a8f88c665eb4',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::PtrType']]] ]; diff --git a/search/all_11.js b/search/all_11.js index d48573eb8..323386e5c 100644 --- a/search/all_11.js +++ b/search/all_11.js @@ -1,10 +1,20 @@ var searchData= [ - 
['uvm_5fcuda_5fmem_5fadvise_0',['uvm_cuda_mem_advise',['../group__cumem-utils.html#ga8a7d93d58bcc9700c3054639973e25b6',1,'fbgemm_gpu']]], - ['uvm_5fcuda_5fmem_5fprefetch_5fasync_1',['uvm_cuda_mem_prefetch_async',['../group__cumem-utils.html#ga07e32d271464bafc50cc100cb52ddb85',1,'fbgemm_gpu']]], - ['uvm_5fmem_5fadvice_5fdont_5ffork_2',['uvm_mem_advice_dont_fork',['../group__cumem-utils.html#ga723bf5f1a0ca1c7a77e76054d3332a6e',1,'fbgemm_gpu']]], - ['uvm_5fstorage_3',['uvm_storage',['../group__cumem-utils.html#ga6e119375c731f9e33f4cd81a1f2205e2',1,'fbgemm_gpu']]], - ['uvm_5fto_5fcpu_4',['uvm_to_cpu',['../group__cumem-utils.html#ga6d4781dfa6a77b895140836f6e6d523b',1,'fbgemm_gpu']]], - ['uvm_5fto_5fcpu_5fclone_5',['uvm_to_cpu_clone',['../group__cumem-utils.html#ga98ea4dd0481cc3839cf21e55e003e7af',1,'fbgemm_gpu']]], - ['uvm_5fto_5fdevice_6',['uvm_to_device',['../group__cumem-utils.html#gaad51bd52cc92230c0e91c5d4f61511c2',1,'fbgemm_gpu']]] + ['quantization_20operators_20cuda_0',['Quantization Operators (CUDA)',['../group__quantize-ops-cuda.html',1,'']]], + ['quantize_20data_20cpu_20operators_1',['Quantize Data CPU Operators',['../group__quantize-data-cpu.html',1,'']]], + ['quantize_5fbfloat16_2ecu_2',['quantize_bfloat16.cu',['../quantize__bfloat16_8cu.html',1,'']]], + ['quantize_5ffp8_5frowwise_2ecu_3',['quantize_fp8_rowwise.cu',['../quantize__fp8__rowwise_8cu.html',1,'']]], + ['quantize_5ffused_5f8bit_5frowwise_2ecu_4',['quantize_fused_8bit_rowwise.cu',['../quantize__fused__8bit__rowwise_8cu.html',1,'']]], + ['quantize_5ffused_5fnbit_5frowwise_2ecu_5',['quantize_fused_nbit_rowwise.cu',['../quantize__fused__nbit__rowwise_8cu.html',1,'']]], + ['quantize_5fhfp8_2ecu_6',['quantize_hfp8.cu',['../quantize__hfp8_8cu.html',1,'']]], + ['quantize_5fmsfp_2ecu_7',['quantize_msfp.cu',['../quantize__msfp_8cu.html',1,'']]], + ['quantize_5fops_2ecuh_8',['quantize_ops.cuh',['../quantize__ops_8cuh.html',1,'']]], + 
['quantize_5fops_5fcpu_2ecpp_9',['quantize_ops_cpu.cpp',['../quantize__ops__cpu_8cpp.html',1,'']]], + ['quantize_5fops_5fgpu_2ecpp_10',['quantize_ops_gpu.cpp',['../quantize__ops__gpu_8cpp.html',1,'']]], + ['quantize_5fops_5fmax_11',['QUANTIZE_OPS_MAX',['../quantize__ops_2common_8cuh.html#ac84aa8e4e97b2a4675ec853e802ec4c6',1,'common.cuh']]], + ['quantize_5fops_5fmeta_2ecpp_12',['quantize_ops_meta.cpp',['../quantize__ops__meta_8cpp.html',1,'']]], + ['quantize_5fops_5fmin_13',['QUANTIZE_OPS_MIN',['../quantize__ops_2common_8cuh.html#a7c9f79708fed845d68b88205e5a1c70c',1,'common.cuh']]], + ['quantize_5fops_5futils_2eh_14',['quantize_ops_utils.h',['../quantize__ops__utils_8h.html',1,'']]], + ['quantize_5fpadded_5ffp8_5frowwise_2ecu_15',['quantize_padded_fp8_rowwise.cu',['../quantize__padded__fp8__rowwise_8cu.html',1,'']]], + ['quantize_5fstore_16',['quantize_store',['../namespacefbgemm__gpu.html#af5bbc85156e52ab097bb0f770a2f63e7',1,'fbgemm_gpu']]] ]; diff --git a/search/all_12.js b/search/all_12.js new file mode 100644 index 000000000..f05f0f23f --- /dev/null +++ b/search/all_12.js @@ -0,0 +1,36 @@ +var searchData= +[ + ['radix_5fsort_5fpairs_2ecu_0',['radix_sort_pairs.cu',['../radix__sort__pairs_8cu.html',1,'']]], + ['range_5fdata_1',['range_data',['../namespacefbgemm__gpu.html#aef9d86cd563a5416a6c556a5902c966d',1,'fbgemm_gpu']]], + ['range_5fsize_2',['range_size',['../namespacefbgemm__gpu.html#ad7972a8cfd2b4fbe5e0b5b29f12beaa7',1,'fbgemm_gpu']]], + ['recalibrate_5fvalue_3',['recalibrate_value',['../namespacefbgemm__gpu.html#a6b36a55458d7d4b9024fd515605c29ee',1,'fbgemm_gpu']]], + ['recat_5fcopy_5fasync_5fkernel_4',['recat_copy_async_kernel',['../layout__transform__ops_8cuh.html#a2f3c62685f843be282e18a9805d8ad5c',1,'layout_transform_ops.cuh']]], + ['recat_5fembedding_5fgrad_5foutput_5fcuda_5',['recat_embedding_grad_output_cuda',['../group__layout-transform-cuda.html#ga09438223bb710af7f55fb6d25fc9d99f',1,'fbgemm_gpu']]], + 
['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fbatch_5fcuda_6',['recat_embedding_grad_output_mixed_D_batch_cuda',['../group__layout-transform-cuda.html#gad5cabc0ba0ee6dfd8a8de4e5825c62e9',1,'fbgemm_gpu']]], + ['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fcpu_7',['recat_embedding_grad_output_mixed_D_cpu',['../group__layout-transform-cpu.html#ga8edc2bee42577b7eeb76613b52d62311',1,'fbgemm_gpu']]], + ['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fcuda_8',['recat_embedding_grad_output_mixed_D_cuda',['../group__layout-transform-cuda.html#gaf753887183c2603a01978463228a0343',1,'fbgemm_gpu']]], + ['registration_5flist_9',['registration_list',['../classfbgemm__gpu_1_1enum__registration.html#afbf71e4018b8f6bf7ff11e50f3aeed14',1,'fbgemm_gpu::enum_registration']]], + ['reorder_5fbatched_5fad_5findices_5fcpu_10',['reorder_batched_ad_indices_cpu',['../namespacefbgemm__gpu.html#a71657f0dff28b74e6cb71f2e70adba96',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5findices_5fcpu_5f_11',['reorder_batched_ad_indices_cpu_',['../namespacefbgemm__gpu.html#abe2eef805cfc20b2d3ba69e3db973688',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5findices_5fgpu_12',['reorder_batched_ad_indices_gpu',['../namespacefbgemm__gpu.html#a10ae2e750abd260fb3dc2deb5e6a10a6',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5f_13',['reorder_batched_ad_lengths_',['../namespacefbgemm__gpu.html#a87472f171b785c3735bc88d72c8ddd9e',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5fcpu_14',['reorder_batched_ad_lengths_cpu',['../namespacefbgemm__gpu.html#aee6a046b2315137787cced8d9942a248',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5fgpu_15',['reorder_batched_ad_lengths_gpu',['../namespacefbgemm__gpu.html#af398efd1fa34f78e6882f7691aa99fa9',1,'fbgemm_gpu']]], + ['report_5fembedding_5ferror_16',['report_embedding_error',['../namespacefbgemm__gpu.html#a17e57fc2dca2d6df09e26f3eec69464c',1,'fbgemm_gpu']]], + 
['reset_17',['reset',['../structfbgemm__gpu_1_1_vec4_acc_t.html#a290527af29e033f3ed6f5464ded1b07e',1,'fbgemm_gpu::Vec4AccT']]], + ['reset_5fweight_5fmomentum_2ecu_18',['reset_weight_momentum.cu',['../reset__weight__momentum_8cu.html',1,'']]], + ['reset_5fweight_5fmomentum_5fcuda_19',['reset_weight_momentum_cuda',['../group__table-batched-embed-cuda.html#ga59334fdad832f8d67576e6c83a9b9d79',1,'reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size): reset_weight_momentum.cu'],['../group__table-batched-embed-cuda.html#ga59334fdad832f8d67576e6c83a9b9d79',1,'reset_weight_momentum_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor D_offsets, Tensor pruned_indices, Tensor pruned_indices_offsets, Tensor logical_table_ids, Tensor buffer_ids, Tensor cache_hash_size_cumsum, Tensor lxu_cache_state, int64_t total_cache_hash_size): reset_weight_momentum.cu']]], + ['right_20',['right',['../namespacefbgemm__gpu.html#a2f54f8b71f0d765e2b7dbd9a8b9774ff',1,'fbgemm_gpu']]], + ['rk_5fdouble_21',['rk_double',['../namespacefbgemm__gpu.html#af9dc4afe0a87b2326caf53649eee20eb',1,'fbgemm_gpu']]], + ['rk_5frandom_22',['rk_random',['../namespacefbgemm__gpu.html#a3914fbd6fed76ebe8d05a1967ec5ccb9',1,'fbgemm_gpu']]], + ['rk_5fseed_23',['rk_seed',['../namespacefbgemm__gpu.html#ad56b0e8dd76a57dcc1e268831fe58abb',1,'fbgemm_gpu']]], + 
['rk_5fstate_24',['rk_state',['../structfbgemm__gpu_1_1rk__state.html',1,'fbgemm_gpu']]], + ['rk_5fzipf_25',['rk_zipf',['../namespacefbgemm__gpu.html#ac4468c32ea6dc23cc2d7bded57a53119',1,'fbgemm_gpu']]], + ['round_5fdown_26',['round_down',['../namespacefbgemm__gpu.html#afad69123afbd407f6cd94913da47680e',1,'fbgemm_gpu']]], + ['round_5fup_27',['round_up',['../namespacenbit.html#a3f668dd605c2700542424899b9df54c6',1,'nbit']]], + ['row_5f_28',['row_',['../structfbgemm__gpu_1_1_weight_row.html#aba84449b569f220a80ccbbcc1d4da57c',1,'fbgemm_gpu::WeightRow']]], + ['row_5findices_29',['row_indices',['../structinternal_1_1_hyper_compressed_sparse_column.html#a22af9d871fd3faef3d676cc6757debcc',1,'internal::HyperCompressedSparseColumn']]], + ['row_5fstart_30',['row_start',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a91f4b44299546e7bea8da7a89cff344e',1,'row_start: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a91f4b44299546e7bea8da7a89cff344e',1,'row_start: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['row_5fstorage_5f_31',['row_storage_',['../classssd_1_1_initializer.html#a3f2b57f32ee510408e83a7c26716d7a5',1,'ssd::Initializer']]], + ['run_5femulate_5fcache_5fmiss_32',['run_emulate_cache_miss',['../uvm__cache__miss__emulate__test_8cpp.html#ac9959da4e8495e9b74415473535a9c3e',1,'uvm_cache_miss_emulate_test.cpp']]] +]; diff --git a/search/all_13.js b/search/all_13.js new file mode 100644 index 000000000..235f9af1a --- /dev/null +++ b/search/all_13.js @@ -0,0 +1,229 @@ +var searchData= +[ + ['saved_5fparams_0',['SAVED_PARAMS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: embedding_forward_split_kernel_v2_template.cu']]], + ['saved_5fparams_5fcnt_1',['SAVED_PARAMS_CNT',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a777533551368ab4bfca5c1c8083e3e89',1,'SAVED_PARAMS_CNT: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a777533551368ab4bfca5c1c8083e3e89',1,'SAVED_PARAMS_CNT: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['seg_5fend_2',['seg_end',['../namespacefbgemm__gpu.html#a4311f4976e51399caed297d2cad3bfd3',1,'fbgemm_gpu']]], + ['seg_5fstart_3',['seg_start',['../namespacefbgemm__gpu.html#adc735e446799084e3d27da58cf5807c3',1,'fbgemm_gpu']]], + ['segment_5fend_4',['segment_end',['../namespacefbgemm__gpu.html#a00965ae9e18f8292077b81d9040515c0',1,'fbgemm_gpu']]], + ['segment_5foffsets_5fdata_5',['segment_offsets_data',['../namespacefbgemm__gpu.html#a091bd2259a1e959d0052ad2fa399065f',1,'fbgemm_gpu']]], + ['segment_5fstart_6',['segment_start',['../namespacefbgemm__gpu.html#aa58de74ea57ed45322b04e829cb75d9b',1,'fbgemm_gpu']]], + ['segment_5fsum_5fcsr_5fcpu_7',['segment_sum_csr_cpu',['../namespacefbgemm__gpu.html#a678327561759694192908f1f111424f7',1,'fbgemm_gpu']]], + ['segment_5fsum_5fcsr_5fcuda_8',['segment_sum_csr_cuda',['../namespacefbgemm__gpu.html#a8ae9711da44e5cd4a81f95a762b41180',1,'fbgemm_gpu']]], + ['segment_5fvalue_5fdata_9',['segment_value_data',['../namespacefbgemm__gpu.html#ac49066d09ce07fcb75c1f913da32b626',1,'fbgemm_gpu']]], + ['set_10',['set',['../classssd_1_1_embedding_rocks_d_b.html#a1951c5647b663fc955ee1076f68190ec',1,'ssd::EmbeddingRocksDB']]], + ['set_5fcuda_11',['set_cuda',['../classssd_1_1_embedding_rocks_d_b.html#a1b6c5343b7eafae73491f0749f1151a9',1,'ssd::EmbeddingRocksDB']]], + 
['set_5fstochastic_5frounding_12',['set_stochastic_rounding',['../structfbgemm__gpu_1_1_weight_row.html#a4548dbb10be8705cf81e3e2362f1cea3',1,'fbgemm_gpu::WeightRow']]], + ['sharedmemory_13',['SharedMemory',['../structfbgemm__gpu_1_1_shared_memory.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20double_20_3e_14',['SharedMemory< double >',['../structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20float_20_3e_15',['SharedMemory< float >',['../structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20int32_5ft_20_3e_16',['SharedMemory< int32_t >',['../structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20int64_5ft_20_3e_17',['SharedMemory< int64_t >',['../structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20vec4t_3c_20at_3a_3aacc_5ftype_3c_20double_2c_20true_20_3e_20_3e_20_3e_18',['SharedMemory< Vec4T< at::acc_type< double, true > > >',['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20vec4t_3c_20at_3a_3aacc_5ftype_3c_20float_2c_20true_20_3e_20_3e_20_3e_19',['SharedMemory< Vec4T< at::acc_type< float, true > > >',['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html',1,'fbgemm_gpu']]], + ['shfl_5fdown_5fsync_20',['shfl_down_sync',['../namespacefbgemm__gpu.html#a52eb62356a603284f18652bc195274ea',1,'fbgemm_gpu']]], + ['shfl_5fsync_21',['SHFL_SYNC',['../embedding__forward__template__helpers_8cuh.html#adce6eee5db9c1c3f52ff15d9fe263495',1,'SHFL_SYNC: embedding_forward_template_helpers.cuh'],['../embedding__backward__template__helpers_8cuh.html#adce6eee5db9c1c3f52ff15d9fe263495',1,'SHFL_SYNC: embedding_backward_template_helpers.cuh']]], + 
['shfl_5fsync_22',['shfl_sync',['../namespacefbgemm__gpu.html#a9b3fcf49a28b6524c8db8c7c523e1798',1,'fbgemm_gpu']]], + ['shfl_5fxor_23',['shfl_xor',['../namespacefbgemm__gpu.html#a17b07e8668ed9b29a8b37d21a829723d',1,'fbgemm_gpu']]], + ['should_5fprune_24',['should_prune',['../namespacefbgemm__gpu.html#a4ae09e478c1e9d6a414935fb6cf60f99',1,'fbgemm_gpu']]], + ['size_25',['size',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a53408e729e4cd52d06e5c577afbfcf9d',1,'fbgemm_gpu::TensorAccessorBase::size()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a53408e729e4cd52d06e5c577afbfcf9d',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::size()']]], + ['sizes_26',['sizes',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a5b7afa180d3bd84115f26a365b167e5e',1,'fbgemm_gpu::TensorAccessorBase']]], + ['sizes_5f_27',['sizes_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a3665ab1adc4a5618fa5e22e00ff0e848',1,'fbgemm_gpu::TensorAccessorBase::sizes_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#ac2dd270bd9c520d7599dbc5626642cd9',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::sizes_']]], + ['sl_28',['SL',['../namespacefbgemm__gpu.html#a4478543eef2b1a98a328e4c634b5f6ad',1,'fbgemm_gpu']]], + ['smem_29',['smem',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a843d0aea30f5cc9663eb720c3dd003ce',1,'smem: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a843d0aea30f5cc9663eb720c3dd003ce',1,'smem: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['smem_5fcache_5fweight_5fdata_30',['SMEM_CACHE_WEIGHT_DATA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fcache_5fweight_5fptr_31',['SMEM_CACHE_WEIGHT_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5femb_5fweight_5fdata_32',['SMEM_EMB_WEIGHT_DATA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5femb_5fweight_5fptr_33',['SMEM_EMB_WEIGHT_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: 
embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fgeneric_5fptr_34',['SMEM_GENERIC_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5foffset_35',['SMEM_OFFSET',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fptr_5fbase_36',['SMEM_PTR_BASE',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: embedding_forward_split_kernel_v2_template.cu']]], + ['sort_37',['sort',['../structfbgemm__gpu_1_1_bitonic_sort.html#ae729c535b885ed8e2aca6d99ef51e4b0',1,'fbgemm_gpu::BitonicSort']]], + ['sorted_5finfos_38',['sorted_infos',['../namespacefbgemm__gpu.html#a89d9dff100cfa1f022fcfbf61e2500cc',1,'fbgemm_gpu']]], + 
['sorted_5flinear_5findices_5fcumulative_5frun_5flengths_39',['sorted_linear_indices_cumulative_run_lengths',['../namespacefbgemm__gpu.html#ae6972dc3932ca715765452e39f97f21b',1,'fbgemm_gpu']]], + ['sorted_5flinear_5findices_5fnum_5fruns_40',['sorted_linear_indices_num_runs',['../namespacefbgemm__gpu.html#a9531de3506c1c1753051c949613ee1b5',1,'fbgemm_gpu']]], + ['sorted_5flinear_5findices_5frun_41',['sorted_linear_indices_run',['../namespacefbgemm__gpu.html#a30d761b81b0e05f95a7a118a17d6c4a2',1,'fbgemm_gpu']]], + ['sorted_5flxu_5fcache_5flocations_42',['sorted_lxu_cache_locations',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a219575ab5da90e4fa43bbb6df6e7831b',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['sparse_20data_20cpu_20operators_43',['Sparse Data CPU Operators',['../group__sparse-data-cpu.html',1,'']]], + ['sparse_20data_20cuda_20operators_44',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]], + ['sparse_5fasync_5fcumsum_2ecu_45',['sparse_async_cumsum.cu',['../sparse__async__cumsum_8cu.html',1,'']]], + ['sparse_5fbatched_5funary_5fembeddings_2ecu_46',['sparse_batched_unary_embeddings.cu',['../sparse__batched__unary__embeddings_8cu.html',1,'']]], + ['sparse_5fblock_5fbucketize_5ffeatures_2ecu_47',['sparse_block_bucketize_features.cu',['../sparse__block__bucketize__features_8cu.html',1,'']]], + ['sparse_5fbucketize_5ffeatures_2ecu_48',['sparse_bucketize_features.cu',['../sparse__bucketize__features_8cu.html',1,'']]], + ['sparse_5fcompute_5ffrequency_5fsequence_2ecu_49',['sparse_compute_frequency_sequence.cu',['../sparse__compute__frequency__sequence_8cu.html',1,'']]], + ['sparse_5fexpand_5finto_5fjagged_5fpermute_2ecu_50',['sparse_expand_into_jagged_permute.cu',['../sparse__expand__into__jagged__permute_8cu.html',1,'']]], + ['sparse_5fgroup_5findex_2ecu_51',['sparse_group_index.cu',['../sparse__group__index_8cu.html',1,'']]], + 
['sparse_5findex_5fadd_2ecu_52',['sparse_index_add.cu',['../sparse__index__add_8cu.html',1,'']]], + ['sparse_5findex_5fselect_2ecu_53',['sparse_index_select.cu',['../sparse__index__select_8cu.html',1,'']]], + ['sparse_5finvert_5fpermute_2ecu_54',['sparse_invert_permute.cu',['../sparse__invert__permute_8cu.html',1,'']]], + ['sparse_5fops_2ecuh_55',['sparse_ops.cuh',['../sparse__ops_8cuh.html',1,'']]], + ['sparse_5fops_2eh_56',['sparse_ops.h',['../sparse__ops_8h.html',1,'']]], + ['sparse_5fops_5fcpu_2ecpp_57',['sparse_ops_cpu.cpp',['../sparse__ops__cpu_8cpp.html',1,'']]], + ['sparse_5fops_5fgpu_2ecpp_58',['sparse_ops_gpu.cpp',['../sparse__ops__gpu_8cpp.html',1,'']]], + ['sparse_5fops_5fmeta_2ecpp_59',['sparse_ops_meta.cpp',['../sparse__ops__meta_8cpp.html',1,'']]], + ['sparse_5fops_5futils_2eh_60',['sparse_ops_utils.h',['../sparse__ops__utils_8h.html',1,'']]], + ['sparse_5fops_5futils_5ftest_2ecpp_61',['sparse_ops_utils_test.cpp',['../sparse__ops__utils__test_8cpp.html',1,'']]], + ['sparse_5fpack_5fsegments_5fbackward_2ecu_62',['sparse_pack_segments_backward.cu',['../sparse__pack__segments__backward_8cu.html',1,'']]], + ['sparse_5fpack_5fsegments_5fforward_2ecu_63',['sparse_pack_segments_forward.cu',['../sparse__pack__segments__forward_8cu.html',1,'']]], + ['sparse_5fpermute102_2ecu_64',['sparse_permute102.cu',['../sparse__permute102_8cu.html',1,'']]], + ['sparse_5fpermute_5f1d_2ecu_65',['sparse_permute_1d.cu',['../sparse__permute__1d_8cu.html',1,'']]], + ['sparse_5fpermute_5f2d_2ecu_66',['sparse_permute_2d.cu',['../sparse__permute__2d_8cu.html',1,'']]], + ['sparse_5fpermute_5fembeddings_2ecu_67',['sparse_permute_embeddings.cu',['../sparse__permute__embeddings_8cu.html',1,'']]], + ['sparse_5frange_2ecu_68',['sparse_range.cu',['../sparse__range_8cu.html',1,'']]], + ['sparse_5freorder_5fbatched_5fad_2ecu_69',['sparse_reorder_batched_ad.cu',['../sparse__reorder__batched__ad_8cu.html',1,'']]], + 
['sparse_5fsegment_5fsum_5fcsr_2ecu_70',['sparse_segment_sum_csr.cu',['../sparse__segment__sum__csr_8cu.html',1,'']]], + ['sparse_5fzipf_2ecu_71',['sparse_zipf.cu',['../sparse__zipf_8cu.html',1,'']]], + ['sparsetype_72',['SparseType',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833e',1,'fbgemm_gpu']]], + ['split_5fadagrad_5ftable_5fupdate_5fkernel_73',['split_adagrad_table_update_kernel',['../gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html#aae2b7a37c2c14a8e8575336d88932f5e',1,'gen_embedding_optimizer_adagrad_split_device_kernel.cuh']]], + ['split_5fadam_5ftable_5fupdate_5fkernel_74',['split_adam_table_update_kernel',['../gen__embedding__optimizer__adam__split__device__kernel_8cuh.html#a415ebd6751961f1e6826cfe2712cc85e',1,'gen_embedding_optimizer_adam_split_device_kernel.cuh']]], + ['split_5fapprox_5frowwise_5fadagrad_5ftable_5fupdate_5fkernel_75',['split_approx_rowwise_adagrad_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html#a9263ef077d631b455021b5cfe68d9632',1,'gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh']]], + ['split_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5ftable_5fupdate_5fkernel_76',['split_approx_rowwise_adagrad_with_counter_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html#a2f7931888711cbd1dff1f7fda564b3a5',1,'gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh']]], + ['split_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ftable_5fupdate_5fkernel_77',['split_approx_rowwise_adagrad_with_weight_decay_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html#a30fdc78bf391825590b69585779a9baf',1,'gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh']]], + 
['split_5fapprox_5fsgd_5ftable_5fupdate_5fkernel_78',['split_approx_sgd_table_update_kernel',['../gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html#abcf3f2a323ec4155270a5fcfffecd462',1,'gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh']]], + ['split_5fdense_5ftable_5fupdate_5fkernel_79',['split_dense_table_update_kernel',['../gen__embedding__optimizer__dense__split__device__kernel_8cuh.html#a9a55851e1eec2af9f174c94e138a4aa7',1,'gen_embedding_optimizer_dense_split_device_kernel.cuh']]], + ['split_5fembedding_80',['split_embedding',['../embedding__backward__split__host__template_8cpp.html#a099fcb1910d50cb2f7bcfd36966c67f3',1,'embedding_backward_split_host_template.cpp']]], + ['split_5fembedding_5f_81',['split_embedding_',['../embedding__optimizer__split__host__template_8cpp.html#a043dbacfe97bbbca3dfe0675f0073939',1,'embedding_optimizer_split_host_template.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5fcpu_82',['split_embedding_backward_codegen_adagrad_cpu',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html#a5e9389fec0497e9f90df6043627319ca',1,'split_embedding_backward_codegen_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__adagrad__cpu_8cpp.html#a5e9389fec0497e9f90df6043627319ca',1,'split_embedding_backward_codegen_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, 
Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5funweighted_5fexact_5fcuda_83',['split_embedding_backward_codegen_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#a1207210a9545e9575750541d0b87d2ff',1,'split_embedding_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#a06b1cf5ad03a298c5257a31b33524398',1,'split_embedding_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t 
pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5fweighted_5fexact_5fcuda_84',['split_embedding_backward_codegen_adagrad_weighted_exact_cuda',['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#a0e8cc9d4217b55864ac828677d7d546d',1,'split_embedding_backward_codegen_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#affb9be553e49e7bea6a6c3f60b63dc04',1,'split_embedding_backward_codegen_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor 
&weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadam_5funweighted_5fexact_5fcuda_85',['split_embedding_backward_codegen_adam_unweighted_exact_cuda',['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#a7c3fa518fa48a831ea3f8e691672808e',1,'split_embedding_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#ae27a3d26d13d596aaaa1e621990e0d71',1,'split_embedding_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadam_5fweighted_5fexact_5fcuda_86',['split_embedding_backward_codegen_adam_weighted_exact_cuda',['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#aea34407b88c9df5b3be55e8ea24a347d',1,'split_embedding_backward_codegen_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t 
max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a8e4ae3bed221149c3b3ab6a5c0f38605',1,'split_embedding_backward_codegen_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_87',['split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#a346e3b137705a7c27ea4448090c853ca',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#afbce26182226d45104cf25fc6ebf90df',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor 
&lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fweighted_5fexact_5fcuda_88',['split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#a1ff3b73be256bfc5b6a6a92c35f5c101',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#ae5ec715aff7b59ae2cd64991053a8744',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5fcpu_89',['split_embedding_backward_codegen_dense_cpu',['../gen__embedding__backward__dense__split__cpu_8cpp.html#a9872de3651e55555a2bea1c407c45c5d',1,'split_embedding_backward_codegen_dense_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, double unused=0): gen_embedding_backward_dense_split_cpu.cpp'],['../embedding__backward__dense__host__cpu_8cpp.html#a16114b295cd4bb55fd704d1cc575284f',1,'split_embedding_backward_codegen_dense_cpu(Tensor grad_output, Tensor host_weights, Tensor 
weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, double unused): gen_embedding_backward_dense_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5funweighted_5fexact_5fcuda_90',['split_embedding_backward_codegen_dense_unweighted_exact_cuda',['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#af39484621a2a43237ee275c7d9497e16',1,'split_embedding_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aebdb9ab2fd0166beebd42528ea223ac4',1,'split_embedding_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5fweighted_5fexact_5fcuda_91',['split_embedding_backward_codegen_dense_weighted_exact_cuda',['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#aeae20f9c1a93bb4297f2710fe00723a2',1,'split_embedding_backward_codegen_dense_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor 
&hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_weighted_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a4a920500b84d7febde7964cfa515c690',1,'split_embedding_backward_codegen_dense_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flamb_5funweighted_5fexact_5fcuda_92',['split_embedding_backward_codegen_lamb_unweighted_exact_cuda',['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#a45b16bde5dcd4ed361824c02fb19aa28',1,'split_embedding_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor 
momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#abafaac43ca0a5d04be6280c0db92ef81',1,'split_embedding_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flamb_5fweighted_5fexact_5fcuda_93',['split_embedding_backward_codegen_lamb_weighted_exact_cuda',['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#ac26e29ea75fba6b9f3922118cd293b96',1,'split_embedding_backward_codegen_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor 
&offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a2cb504a8487e7581fcf600c9dd9bb4da',1,'split_embedding_backward_codegen_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5flars_5fsgd_5funweighted_5fexact_5fcuda_94',['split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#a68717d5b465de7efb3f58ca7f1c9c48e',1,'split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#ad6a69a83e0c09e08c8854f3a988349c2',1,'split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool 
use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flars_5fsgd_5fweighted_5fexact_5fcuda_95',['split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#a3eff146e8f81f6d6dcc6e08f791b1c27',1,'split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a592a95a9e623ca87fb31c88bc11ef217',1,'split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const 
int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fnone_5funweighted_5fexact_5fcuda_96',['split_embedding_backward_codegen_none_unweighted_exact_cuda',['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#ac780b945eb2c0cff713ff7280122da42',1,'split_embedding_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#ab8077c80baaf216fec8c7c0c81cd0c29',1,'split_embedding_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor 
&hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): gen_embedding_backward_none_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fnone_5fweighted_5fexact_5fcuda_97',['split_embedding_backward_codegen_none_weighted_exact_cuda',['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#a12b41a32a38b812420382dfb33a09e17',1,'split_embedding_backward_codegen_none_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a7808efa8b7d1caa4534528c97b55a26b',1,'split_embedding_backward_codegen_none_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const 
Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): gen_embedding_backward_none_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5funweighted_5fexact_5fcuda_98',['split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#a4d39b6b803c05c33caf58b4a2fbf37ac',1,'split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a3d1da3b63c8a16884d3de8d52c0b99fd',1,'split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5fweighted_5fexact_5fcuda_99',['split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#ac295880f03c86cb263b324158e460e82',1,'split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, 
const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#aaa1e9d0adf68022fa575a63182a95745',1,'split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5funweighted_5fexact_5fcuda_100',['split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#a561ce0f1da43ca47001db85a395203e1',1,'split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ab047f1b46e810b2a48f66387d37cd588',1,'split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t 
BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5fweighted_5fexact_5fcuda_101',['split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#a70ac0537228900edc94bbd437c550a15',1,'split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a6619694897abaeee44b975fa9614d7e3',1,'split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fcpu_102',['split_embedding_backward_codegen_rowwise_adagrad_cpu',['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html#a73c1fd212c2c324e57b0f906a2598360',1,'split_embedding_backward_codegen_rowwise_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, 
double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html#a73c1fd212c2c324e57b0f906a2598360',1,'split_embedding_backward_codegen_rowwise_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fcuda_103',['split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#aca4e3268cb308c63a299f50cde66dec1',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, 
Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ad73707297535524e1eeff86f23adfdfa',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fvbe_5fcuda_104',['split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#a85ffab9880f2b1221f86a7f63c088096',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, 
const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ae52a1e89225c55716b2505ef0b14b32c',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fweighted_5fexact_5fcuda_105',['split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#a9e02b82c5db58357a98bc86454c2d7a5',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a394f3f0a5cbe256e703c0bb34bfe50b3',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const 
int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fweighted_5fexact_5fvbe_5fcuda_106',['split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#a0266589d7dcf9f22a9398090ae16abac',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#af257dbbdb6a2c64fdb2e038bb39190c1',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda(const 
Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_107',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html#a2e90723fcad83f3054bc6f661de849c1',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_host, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_host, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, 
int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html#a2e90723fcad83f3054bc6f661de849c1',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_host, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_host, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5funweighted_5fexact_5fcuda_108',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#afa64170f02313b2766c2cc3e25d2f5a9',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#aea7503341318b3b0142a83d310046516',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5fweighted_5fexact_5fcuda_109',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#aeed29f5cd2c5bacfd4ed37b2381c128b',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a07c978ecc3495651d0123d01876f68ca',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_110',['split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#acc9cd7c72b1624ec0df8d9f4edbde2cb',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a9f5e043a0a43d92b7a748c27e6ce8060',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const 
int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fweighted_5fexact_5fcuda_111',['split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#a969bc368ad46c57ab47feac737df5001',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a6cb23330ccfc55cc78d828d1fd8b59fb',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5fcpu_112',['split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html#acb5592b9d0b5b9344302f69c0f1be10b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, 
int64_t iter=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html#acb5592b9d0b5b9344302f69c0f1be10b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5funweighted_5fexact_5fcuda_113',['split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#a10025996061290114d6060505057ce7b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor 
momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a3a9f041d93d95908fbe76052c3d48a3e',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5fweighted_5fexact_5fcuda_114',['split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#a74ae14449034e73352a950be7faee8cd',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, 
const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#aad0ff2a4c042997b9969d779d3c91c59',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fcpu_115',['split_embedding_backward_codegen_sgd_cpu',['../gen__embedding__backward__sgd__split__cpu_8cpp.html#a9d914bb02aed97803fcc9237f00403fa',1,'split_embedding_backward_codegen_sgd_cpu(Tensor grad_output, Tensor host_weights, 
Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_sgd_split_cpu.cpp'],['../gen__embedding__backward__split__sgd__cpu_8cpp.html#a9d914bb02aed97803fcc9237f00403fa',1,'split_embedding_backward_codegen_sgd_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_sgd_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fcuda_116',['split_embedding_backward_codegen_sgd_unweighted_exact_cuda',['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#ad2d75e84d796d6d8fae77c19e7a8af3b',1,'split_embedding_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): 
gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a8f7618b0f318fed552700a9303e0c500',1,'split_embedding_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fvbe_5fcuda_117',['split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda',['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#a216acb29a8d546146f5593b5abd7eaa1',1,'split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double 
learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#aa75d2899ee39c0d5f71e426d1cc7d57c',1,'split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fweighted_5fexact_5fcuda_118',['split_embedding_backward_codegen_sgd_weighted_exact_cuda',['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#a16ec895b54d5b04f3fdfa67930c1c02a',1,'split_embedding_backward_codegen_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool 
use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a2934aefc05b7ad4bc6e07074f0a2ee1e',1,'split_embedding_backward_codegen_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fweighted_5fexact_5fvbe_5fcuda_119',['split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda',['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#a9ee8617b61b6a4be1391fe53321bf927',1,'split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor 
&vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a67f194387a7e81de22d969964f1cc379',1,'split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fcpu_120',['split_embedding_codegen_forward_cpu',['../embedding__forward__split__cpu_8cpp.html#aaf201bc6f5c8deb12999a3eff03cf7bb',1,'split_embedding_codegen_forward_cpu(Tensor weights, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor hash_size_cumsum, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, int64_t output_dtype): embedding_forward_split_cpu.cpp'],['../embedding__forward__split__cpu_8h.html#a01e2ccf0c687aa129f511c048dd878a2',1,'split_embedding_codegen_forward_cpu(at::Tensor weights, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor hash_size_cumsum, at::Tensor indices, at::Tensor offsets, int64_t pooling_mode, at::Tensor indice_weights, int64_t output_dtype=0): 
embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5fcpu_5fmeta_121',['split_embedding_codegen_forward_cpu_meta',['../embedding__forward__split__cpu_8cpp.html#a0641f4b915d503586cb2d251029169e4',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_122',['split_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, 
const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fmeta_123',['split_embedding_codegen_forward_unweighted_meta',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#a2b7fe88621ffc9b8dc0b55efafb6cb83',1,'gen_embedding_forward_split_unweighted_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fvbe_5fcuda_124',['split_embedding_codegen_forward_unweighted_vbe_cuda',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const 
int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu']]], + 
['split_5fembedding_5fcodegen_5fforward_5funweighted_5fvbe_5fmeta_125',['split_embedding_codegen_forward_unweighted_vbe_meta',['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html#ac45ac774af2f2cdc3ef15fccacbc9866',1,'gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_126',['split_embedding_codegen_forward_weighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, 
const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor 
&lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fmeta_127',['split_embedding_codegen_forward_weighted_meta',['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html#a3f1b063bf337baa7c85cd891f50dcb17',1,'gen_embedding_forward_split_weighted_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fvbe_5fcuda_128',['split_embedding_codegen_forward_weighted_vbe_cuda',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor 
&weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool 
is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fvbe_5fmeta_129',['split_embedding_codegen_forward_weighted_vbe_meta',['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html#aafe550801c2d2c26cf43ccef3a6ac0e9',1,'gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcpu_130',['split_embedding_codegen_grad_indice_weights_cpu',['../embedding__forward__split__cpu_8cpp.html#a03b54fa4944d00f3984442a980742701',1,'split_embedding_codegen_grad_indice_weights_cpu(Tensor grad_output, Tensor weights, Tensor weights_offsets, Tensor D_offsets, Tensor indices, Tensor offsets, Tensor feature_requires_grad): embedding_forward_split_cpu.cpp'],['../embedding__forward__split__cpu_8h.html#a371a7887c9af52b22bdc10e84d5c2ba6',1,'split_embedding_codegen_grad_indice_weights_cpu(at::Tensor grad_output, at::Tensor weights, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor indices, at::Tensor offsets, at::Tensor feature_requires_grad): embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcuda_131',['split_embedding_codegen_grad_indice_weights_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor 
&dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor 
&feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor 
&lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fvbe_5fcuda_132',['split_embedding_codegen_grad_indice_weights_vbe_cuda',['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): gen_embedding_backward_split_indice_weights_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5flookup_5fadagrad_5ffunction_133',['split_embedding_codegen_lookup_adagrad_function',['../group__embedding-cuda.html#gaa0988eef90f8662e8886912ed3784c1d',1,'gen_embedding_backward_split_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fadam_5ffunction_134',['split_embedding_codegen_lookup_adam_function',['../group__embedding-cuda.html#ga639ddbb31e9d565bfcfa4766b14c9ef6',1,'gen_embedding_backward_split_adam.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5ffunction_135',['split_embedding_codegen_lookup_approx_rowwise_adagrad_function',['../group__embedding-cuda.html#gac847393d811e7b22ace39ff91eb91e27',1,'gen_embedding_backward_split_approx_rowwise_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5ffunction_136',['split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function',['../group__embedding-cuda.html#gabf7587752fb66934350cec59cd7adda9',1,'gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ffunction_137',['split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function',['../group__embedding-cuda.html#ga0a7191adb6807417bfaab85ccb6fac50',1,'gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5fsgd_5ffunction_138',['split_embedding_codegen_lookup_approx_sgd_function',['../group__embedding-cuda.html#gabcff81381942478b57805e5deb7725fb',1,'gen_embedding_backward_split_approx_sgd.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fdense_5ffunction_139',['split_embedding_codegen_lookup_dense_function',['../embedding__backward__dense__host_8cpp.html#a04b7d97e6fd0bbb6e9877db0c1b7e506',1,'embedding_backward_dense_host.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5flamb_5ffunction_140',['split_embedding_codegen_lookup_lamb_function',['../group__embedding-cuda.html#ga1c377dd2500d38974bbfe0e69243e084',1,'gen_embedding_backward_split_lamb.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5flars_5fsgd_5ffunction_141',['split_embedding_codegen_lookup_lars_sgd_function',['../group__embedding-cuda.html#ga5c0d733a2e781ea4c9fc5ab3a2d6ccf3',1,'gen_embedding_backward_split_lars_sgd.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fnone_5ffunction_142',['split_embedding_codegen_lookup_none_function',['../group__embedding-cuda.html#ga855a30b389de5a61097f44cff795b6c3',1,'gen_embedding_backward_split_none.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fpartial_5frowwise_5fadam_5ffunction_143',['split_embedding_codegen_lookup_partial_rowwise_adam_function',['../group__embedding-cuda.html#ga06feb6c425fba7c460dc0da550d4e4e6',1,'gen_embedding_backward_split_partial_rowwise_adam.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fpartial_5frowwise_5flamb_5ffunction_144',['split_embedding_codegen_lookup_partial_rowwise_lamb_function',['../group__embedding-cuda.html#ga37b9129c928c9cb39459198f36f11c8d',1,'gen_embedding_backward_split_partial_rowwise_lamb.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5ffunction_145',['split_embedding_codegen_lookup_rowwise_adagrad_function',['../group__embedding-cuda.html#gacc3d997b675b747985dd37193cac4edd',1,'gen_embedding_backward_split_rowwise_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5fwith_5fcounter_5ffunction_146',['split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function',['../group__embedding-cuda.html#ga917cf0c2c4487425408808529ed05e68',1,'gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ffunction_147',['split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function',['../group__embedding-cuda.html#ga2e19021f546871ef6f1e57fca7cf5e13',1,'gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fweighted_5fadagrad_5ffunction_148',['split_embedding_codegen_lookup_rowwise_weighted_adagrad_function',['../group__embedding-cuda.html#ga54a40e0e64a528731d45bca998727a1c',1,'gen_embedding_backward_split_rowwise_weighted_adagrad.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fsgd_5ffunction_149',['split_embedding_codegen_lookup_sgd_function',['../group__embedding-cuda.html#ga66c2eb0df8e5dab40f0d862ebe43bd34',1,'gen_embedding_backward_split_sgd.cpp']]], + ['split_5fembedding_5fforward_5fcpu_5fkernel_150',['split_embedding_forward_cpu_kernel',['../embedding__forward__split__cpu_8cpp.html#af360a949beb9bba72466614e220da13d',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fgrad_5findice_5fweights_5fcpu_5fkernel_151',['split_embedding_grad_indice_weights_cpu_kernel',['../embedding__forward__split__cpu_8cpp.html#a1156d3aee8ccb8a6676b22f78fe0829c',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fadagrad_5funweighted_5fexact_5fcuda_152',['split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#a635c3123249dcf767e8d80923e11a7b1',1,'split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#ad491e078738cfd46a4d2377948b977fc',1,'split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, 
const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fadam_5funweighted_5fexact_5fcuda_153',['split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda',['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#a6392bb8bf8131572a96cb5bf5a363152',1,'split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a6a9de0e9036f30dbd7d7e4442ae7e5fe',1,'split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_154',['split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#a8d755844b3dc430390b0db02833650a7',1,'split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const 
Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a701f363d76409a2aa4df028f12ba0300',1,'split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fdense_5funweighted_5fexact_5fcuda_155',['split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda',['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#a11ce1782edb9d58fffb5fe2581172d70',1,'split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a7911ad2a461036b977d8d9f9fafb391a',1,'split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5flamb_5funweighted_5fexact_5fcuda_156',['split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda',['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#ad6463435db98705077041803b394dcc3',1,'split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool 
stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a557b019964c8d292ca9923927e0d974a',1,'split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5flars_5fsgd_5funweighted_5fexact_5fcuda_157',['split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#a0e0807f32e264e5a83586907ea3f6749',1,'split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a80df1bf7d746582f689d6bc4652f7266',1,'split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor 
momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fnone_5funweighted_5fexact_5fcuda_158',['split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda',['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#af181f8da92e59fb5da465d0931859e77',1,'split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a1540203f5279dd87016b397fe33fb041',1,'split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): 
gen_embedding_backward_none_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5funweighted_5fexact_5fcuda_159',['split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#a96e4c395674727814da03c2e1654487b',1,'split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#aed21b16681b11ddd3303195bc4e278ec',1,'split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, 
const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5funweighted_5fexact_5fcuda_160',['split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#a41c428effc52b315649ebd4bda728619',1,'split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ad14c41705ba6da0dc89b8802945b9a3a',1,'split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fcuda_161',['split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#a05fd1c9f2aea152f9cbe2def957c66fb',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t 
unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a4bdf992307f845985594c371275668a8',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5funweighted_5fexact_5fcuda_162',['split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#a0377d50ef90391567b4819a19bffb34c',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a7a94588a2cce7c8cad5f1654d5724ea3',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_163',['split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af25017968213662e5c8c0ab9f5fa7e9a',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a31dd9b41f6ea038416e54092a7fcb594',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const 
int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5funweighted_5fexact_5fcuda_164',['split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#a42435ea3b63f42213a2c24d4aadc84f6',1,'split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#aaf57ee3cb4514d7ccec1c0f5bd653ed3',1,'split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const 
Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fcuda_165',['split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda',['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#a2260d3e46945437faae7a44fe015bf7c',1,'split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#ad92e69305915e46befca51e7288b428b',1,'split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor 
&lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_166',['split_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor 
&lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const 
Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const 
Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu']]], + ['split_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fmeta_167',['split_embedding_nobag_codegen_forward_unweighted_meta',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#a580b1b950402848a3c71d7092a69ceb7',1,'gen_embedding_forward_split_unweighted_codegen_meta.cpp']]], + ['split_5fembedding_5frowwise_5fadagrad_5fupdate_168',['split_embedding_rowwise_adagrad_update',['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html#a21a7b48ff9760f1aa13e260de4b7d2a9',1,'split_embedding_rowwise_adagrad_update(Tensor &dev_weights, Tensor &uvm_weights, Tensor &lxu_cache_weights, const Tensor &grad_dev_weights, const Tensor &grad_dev_indices, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t max_D, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_optimizer_rowwise_adagrad_split.cpp'],['../gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html#ab369ffc9f9e69eca82b24131247ecfcf',1,'split_embedding_rowwise_adagrad_update(Tensor &dev_weights, Tensor &uvm_weights, Tensor &lxu_cache_weights, const Tensor &grad_dev_weights, const Tensor &grad_dev_indices, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t max_D, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu']]], + 
['split_5fembeddings_5fcache_5fcuda_2ecuh_169',['split_embeddings_cache_cuda.cuh',['../split__embeddings__cache__cuda_8cuh.html',1,'']]], + ['split_5fembeddings_5fcache_5fops_2ecpp_170',['split_embeddings_cache_ops.cpp',['../split__embeddings__cache__ops_8cpp.html',1,'']]], + ['split_5fembeddings_5fcache_5fops_2ecu_171',['split_embeddings_cache_ops.cu',['../split__embeddings__cache__ops_8cu.html',1,'']]], + ['split_5fembeddings_5futils_2ecpp_172',['split_embeddings_utils.cpp',['../split__embeddings__utils_8cpp.html',1,'']]], + ['split_5fembeddings_5futils_2ecuh_173',['split_embeddings_utils.cuh',['../split__embeddings__utils_8cuh.html',1,'']]], + ['split_5flamb_5ftable_5fupdate_5fkernel_174',['split_lamb_table_update_kernel',['../gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html#a2952f72a1e3f88f38246d2954dbee2b1',1,'gen_embedding_optimizer_lamb_split_device_kernel.cuh']]], + ['split_5flars_5fsgd_5ftable_5fupdate_5fkernel_175',['split_lars_sgd_table_update_kernel',['../gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html#af488b727a53946064f329ad042bbf73a',1,'gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh']]], + ['split_5fnone_5ftable_5fupdate_5fkernel_176',['split_none_table_update_kernel',['../gen__embedding__optimizer__none__split__device__kernel_8cuh.html#a2cb53295ff111df7a98fbc7573469c61',1,'gen_embedding_optimizer_none_split_device_kernel.cuh']]], + ['split_5fpartial_5frowwise_5fadam_5ftable_5fupdate_5fkernel_177',['split_partial_rowwise_adam_table_update_kernel',['../gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html#a278aedfb9f50b7f5486dbc97e87cab8e',1,'gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh']]], + 
['split_5fpartial_5frowwise_5flamb_5ftable_5fupdate_5fkernel_178',['split_partial_rowwise_lamb_table_update_kernel',['../gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html#a950ea306504584d6cc2050caf007295c',1,'gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5ftable_5fupdate_5fkernel_179',['split_rowwise_adagrad_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html#aab5a925ed0316c38c00fcce3b1adc50a',1,'gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5fwith_5fcounter_5ftable_5fupdate_5fkernel_180',['split_rowwise_adagrad_with_counter_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html#aa7708111891a0d2eeeda7881715427bb',1,'gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ftable_5fupdate_5fkernel_181',['split_rowwise_adagrad_with_weight_decay_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html#ae265a93446a3c4665e857bc8b2f7d8d7',1,'gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh']]], + ['split_5frowwise_5fweighted_5fadagrad_5ftable_5fupdate_5fkernel_182',['split_rowwise_weighted_adagrad_table_update_kernel',['../gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html#a54b1af3a7b8db5fce48d934e47656c50',1,'gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh']]], + ['split_5fsgd_5ftable_5fupdate_5fkernel_183',['split_sgd_table_update_kernel',['../gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html#ab768e225fdd76b64ab5c9114ed3cc7cc',1,'gen_embedding_optimizer_sgd_split_device_kernel.cuh']]], + 
['splitmix64_5fstateless_184',['splitmix64_stateless',['../namespacefbgemm__gpu.html#aa5ada0472a8306dea17df0d7d1d42abc',1,'fbgemm_gpu']]], + ['src_5fidx_185',['src_idx',['../namespacefbgemm__gpu.html#a119724f55ff744b85a20a870b5da4152',1,'fbgemm_gpu']]], + ['ssd_186',['ssd',['../namespacessd.html',1,'']]], + ['ssd_5fcache_5fpopulate_5factions_5fcuda_187',['ssd_cache_populate_actions_cuda',['../ssd__split__embeddings__cache__cuda_8cu.html#a872136033719ff00d6b05e94e4b1cbab',1,'ssd_cache_populate_actions_cuda(Tensor linear_indices, int64_t total_hash_size, Tensor lxu_cache_state, int64_t time_stamp, int64_t prefetch_dist, Tensor lru_state): ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__table__batched__embeddings_8cpp.html#a872136033719ff00d6b05e94e4b1cbab',1,'ssd_cache_populate_actions_cuda(Tensor linear_indices, int64_t total_hash_size, Tensor lxu_cache_state, int64_t time_stamp, int64_t prefetch_dist, Tensor lru_state): ssd_split_embeddings_cache_cuda.cu']]], + ['ssd_5fsplit_5fembeddings_5fcache_5fcuda_2ecu_188',['ssd_split_embeddings_cache_cuda.cu',['../ssd__split__embeddings__cache__cuda_8cu.html',1,'']]], + ['ssd_5fsplit_5ftable_5fbatched_5fembeddings_2ecpp_189',['ssd_split_table_batched_embeddings.cpp',['../ssd__split__table__batched__embeddings_8cpp.html',1,'']]], + ['ssd_5ftable_5fbatched_5fembeddings_2eh_190',['ssd_table_batched_embeddings.h',['../ssd__table__batched__embeddings_8h.html',1,'']]], + ['stackarray_191',['StackArray',['../struct_stack_array.html',1,'']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_2ecu_192',['stacked_jagged_1d_to_dense.cu',['../stacked__jagged__1d__to__dense_8cu.html',1,'']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_5fcpu_193',['stacked_jagged_1d_to_dense_cpu',['../namespacefbgemm__gpu.html#a6ac9f6d81bff1b8572a380dbe1af00fb',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_5fgpu_194',['stacked_jagged_1d_to_dense_gpu',['../namespacefbgemm__gpu.html#adf7f39b1a3dd7c2797fd11e740d6269f',1,'fbgemm_gpu']]], + 
['stacked_5fjagged_5f2d_5fto_5fdense_2ecu_195',['stacked_jagged_2d_to_dense.cu',['../stacked__jagged__2d__to__dense_8cu.html',1,'']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fbackward_5fcuda_196',['stacked_jagged_2d_to_dense_backward_cuda',['../namespacefbgemm__gpu.html#a442efbf57b46780a07ac4759ac1866ee',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fcpu_197',['stacked_jagged_2d_to_dense_cpu',['../namespacefbgemm__gpu.html#ab45e5e415a8929cbd0021eae37e1d881',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fforward_5fcuda_198',['stacked_jagged_2d_to_dense_forward_cuda',['../namespacefbgemm__gpu.html#a5de1d5c177df840f2fa7ab0cdda2aa02',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fgpu_199',['stacked_jagged_2d_to_dense_gpu',['../namespacefbgemm__gpu.html#aaac575e676d094aba1367e9eaf3489bc',1,'fbgemm_gpu']]], + ['start_5finput_200',['start_input',['../namespacefbgemm__gpu.html#abb9cbb13307ba09bfd2a13ca7abbb19b',1,'fbgemm_gpu']]], + ['step_201',['step',['../namespacefbgemm__gpu.html#a17e8d602b1d99905e55e6b875dc306b5',1,'fbgemm_gpu']]], + ['step_202',['STEP',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aad5a825be51026d8249ffccad954dbb5',1,'STEP: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aad5a825be51026d8249ffccad954dbb5',1,'STEP: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['stoc_5frounding_5fstate_5f_203',['stoc_rounding_state_',['../structfbgemm__gpu_1_1_weight_row.html#a666e6a66f2ff524d7dd4339ee8efc9d2',1,'fbgemm_gpu::WeightRow']]], + ['stochastic_5frounding_204',['stochastic_rounding',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a5cc1b5faf7430930527acfac8e6b8068',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['stochastic_5frounding_5finit_205',['stochastic_rounding_init',['../namespacefbgemm__gpu.html#afe523b46c92c9009410f173e4ac434db',1,'fbgemm_gpu']]], + 
['stochastic_5frounding_5fphilox_5fargs_206',['stochastic_rounding_philox_args',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#afff795d859ebc4c98b059d7e04dd8ebd',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['stochastic_5frounding_5frand4_207',['stochastic_rounding_rand4',['../namespacefbgemm__gpu.html#af0b19e6751891f43372768335cc3c468',1,'fbgemm_gpu']]], + ['stochastic_5frounding_5fvector_208',['stochastic_rounding_vector',['../namespacefbgemm__gpu.html#aec7be9515265c4db67d205f8a3a39822',1,'fbgemm_gpu::stochastic_rounding_vector(dst_t *output, const Vec4T< src_t > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a06c37bb32cb18b8846cf689db8ed94fb',1,'fbgemm_gpu::stochastic_rounding_vector(at::Half *output, const Vec4T< at::Half > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a7d41dbbfc3106c8fd5ff37cefbffbc38',1,'fbgemm_gpu::stochastic_rounding_vector(at::Half *output, const Vec4T< float > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a3313b5c0af7bd07d6e47253a24a27ce7',1,'fbgemm_gpu::stochastic_rounding_vector(uint8_t *output, const Vec4T< float > &value, StochasticRoundingRNGState &state, const float2 qparams)'],['../namespacefbgemm__gpu.html#a44ed26caaddd852d96ee453ea6cc2e07',1,'fbgemm_gpu::stochastic_rounding_vector(uint8_t *output, const Vec4T< at::Half > &value, StochasticRoundingRNGState &state, const float2 qparams)']]], + ['stochasticroundingrngstate_209',['StochasticRoundingRNGState',['../structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html',1,'fbgemm_gpu']]], + ['stop_5f_210',['stop_',['../classssd_1_1_initializer.html#ae1a1eeaa4661c67f8e75985160abf62a',1,'ssd::Initializer']]], + ['storage_5f_211',['storage_',['../memory__utils_8cu.html#a1aaf192027acf281933c714c085e6849',1,'memory_utils.cu']]], + 
['store_212',['store',['../structfbgemm__gpu_1_1_half4.html#a89967f417dba84846fa95a0f010d8922',1,'fbgemm_gpu::Half4::store()'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< float >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ab31e8852ca6760cf83d6356c8c448596',1,'fbgemm_gpu::Vec4T< float >::store(float4 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< float >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< float >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< float >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< float >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< at::Half >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< at::Half >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< at::Half >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< at::Half >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< at::Half >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(at::Half *p) 
const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< double >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< double >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< double >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< double >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_weight_row.html#a2118cba7a45acc1a3d8ea5781badbbe9',1,'fbgemm_gpu::WeightRow::store()'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a8191536a88223b7249cae8a8cfa97979',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ab9651b6b0e85a41131aa086c367d68bd',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a0624585ab8592b64edef7a6730938cb9',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, 
const int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a5ba7b1dad5adec8ae5dc9e4adfe58c38',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(uint8_t *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ab208ce429674113143ee02d6b9e8a9be',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ac87524a86f8aa165742c6b793f8fe6aa',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a26ce31b610926ff405b67dc540ff3d95',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a392a5b352be9af9ba86e0cd396e6316a',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a4699624d6b086fa52d88ce1960dc7297',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a1f0743afcc39c1afeeee6cd9bcdddc35',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a2c56bff3020a6b803a8310a13b61cfbe',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#ac4e67ed3ba860166333a7805b101490d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP 
>::store(uint8_t *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a0c562343c84b60da0e5f11ee16e593f2',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a68c214376e86167cbe59755a1caf99a5',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a9e2e827bb7e7c608f3acd3953a39e720',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a32f2acc26afe1a9cf7d5152567bbd15d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a75cd31fa56a77c83611b64ddd370a562',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a81504bf4294b938a3efc8d00acda3b5f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a03b4a86f4326d9c24fec2b4dc63439cd',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ad15c2605b8d982986100c89caa7c0401',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(uint8_t *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a786f9130a8df81af5fc3b0706a1a6545',1,'fbgemm_gpu::VecNT< 4, 
PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#afbc2050eefc2350fd0f84db8dd568d14',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a12b87408afdd840ed3ae2e1870fa8e2a',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a6740fe48ec591c6058b8c5019ca0b599',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a75cd31fa56a77c83611b64ddd370a562',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a81504bf4294b938a3efc8d00acda3b5f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a03b4a86f4326d9c24fec2b4dc63439cd',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#ad15c2605b8d982986100c89caa7c0401',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a786f9130a8df81af5fc3b0706a1a6545',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int 
num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#afbc2050eefc2350fd0f84db8dd568d14',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a12b87408afdd840ed3ae2e1870fa8e2a',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a6740fe48ec591c6058b8c5019ca0b599',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#acf9a6b5f9ac186a75bd50800993e7241',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa231a6e5c1ad91305125e2ba8c6cf773',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa2d60424caff50f6d80adfcd1ab5ba3f',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a5881b8e1b9ca2c81640bad8e6d0a455a',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa7c2038d0448a12c5edd87eb31f8b828',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#abfaf6f8618474ccb25d58d723792421d',1,'fbgemm_gpu::VecNT< 8, 
PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aad5d604b72b0f656dbeb5e313ebf63af',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a6dfa84a3eb11e20e68d8d3b401c7d2cf',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a82b07f279fccc086af2208ca7d6d1a3a',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a1f25b384b68cdb93ddd010a86f661460',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#aa9b6e7a0e81a3a3d049e7c632fec2ad7',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a0d5c2181816bdbb6e5e4998b3fbba721',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a23eb49aef842e89c0f4403d45df27af9',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a483f290add1c81ba850fda8c574f68bb',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int 
num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a9b3adeaa52d595467e06b90520c9708a',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#aba368627faa071e57a548a336c7bee6b',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ac85ba1113a076bb8a6b6e39ad26bb85d',1,'fbgemm_gpu::Vec4AccT::store(float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a775650f6f2480831282ed0a8746998f6',1,'fbgemm_gpu::Vec4AccT::store(float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ae4768b5f85cb93226f4e8e7705a32206',1,'fbgemm_gpu::Vec4AccT::store(uint8_t *ptr)']]], + ['store_5f_213',['store_',['../structfbgemm__gpu_1_1_vec4_acc_t.html#aa05890f2dd90061ad3ff516a30e6c196',1,'fbgemm_gpu::Vec4AccT::store_(const float4 *src, float4 *dst)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a476bc3df6ed11614b47e7c4b1bb440c6',1,'fbgemm_gpu::Vec4AccT::store_(const float4 *src, float2 *dst)']]], + ['store_5fqparams_214',['store_qparams',['../structfbgemm__gpu_1_1_weight_row.html#a7e20dc1480b5220df335895b7ac6bdd0',1,'fbgemm_gpu::WeightRow']]], + ['store_5fqparams_5fto_5frow_215',['store_qparams_to_row',['../namespacefbgemm__gpu.html#a8afc4c2510a6db3d420fc1025d3ac30b',1,'fbgemm_gpu::store_qparams_to_row(emb_t *ptr, float2 qparams)'],['../namespacefbgemm__gpu.html#af4ec15f5d6826c016c46b5d7cae62d72',1,'fbgemm_gpu::store_qparams_to_row(uint8_t *ptr, float2 qparams)']]], + 
['stride_216',['stride',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a396d81b04ec72f4281d15a02c7840694',1,'fbgemm_gpu::TensorAccessorBase::stride()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a396d81b04ec72f4281d15a02c7840694',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::stride()'],['../namespacefbgemm__gpu.html#a85f38ec0d4f8474b6d4ccad168974cf9',1,'fbgemm_gpu::stride']]], + ['strides_217',['strides',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#af446bd0965fd0586067d176a1630a6c1',1,'fbgemm_gpu::TensorAccessorBase']]], + ['strides_5f_218',['strides_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a44a19ea7efb719bdd2baac00add90d40',1,'fbgemm_gpu::TensorAccessorBase::strides_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a20a855fc09b0bad6cc73895d2bd48bea',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::strides_']]], + ['stringify_219',['STRINGIFY',['../_c_make_c_compiler_id_8c.html#a43e1cad902b6477bec893cb6430bd6c8',1,'STRINGIFY: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a43e1cad902b6477bec893cb6430bd6c8',1,'STRINGIFY: CMakeCXXCompilerId.cpp']]], + ['stringify_5fhelper_220',['STRINGIFY_HELPER',['../_c_make_c_compiler_id_8c.html#a2ae9b72bb13abaabfcf2ee0ba7d3fa1d',1,'STRINGIFY_HELPER: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a2ae9b72bb13abaabfcf2ee0ba7d3fa1d',1,'STRINGIFY_HELPER: CMakeCXXCompilerId.cpp']]], + ['sum_221',['sum',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, float >::sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t 
>::sum()'],['../namespacefbgemm__gpu.html#adb83758639b252f212d790847ca2f6b6',1,'fbgemm_gpu::sum']]], + ['sum_222',['SUM',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5da6970bdc2201030b9c03fbdcf3973858a',1,'fbgemm_gpu']]], + ['sum_5fe_223',['sum_E',['../namespacefbgemm__gpu.html#aba761028ac72c20c7defaef09de61d95',1,'fbgemm_gpu']]], + ['sum_5freduce_5fto_5fone_5fdevice_224',['sum_reduce_to_one_device',['../namespacefbgemm__gpu.html#aa7f73354e0c76fbc0584c3250dadc98e',1,'fbgemm_gpu']]], + ['syncwarp_225',['syncwarp',['../namespacefbgemm__gpu.html#ab776b7b9076d17238d502b2746135ace',1,'fbgemm_gpu']]] +]; diff --git a/search/all_14.js b/search/all_14.js new file mode 100644 index 000000000..4f4c8dffc --- /dev/null +++ b/search/all_14.js @@ -0,0 +1,75 @@ +var searchData= +[ + ['t_0',['t',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a886f5e3baf03935340ae10c910916eb9',1,'t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a886f5e3baf03935340ae10c910916eb9',1,'t: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#aa80cbea4714c980d14626fd87c9287a4',1,'fbgemm_gpu::t']]], + ['t_1',['T',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a2ee4b3e799d56c4d34c87190c37a7a64',1,'T: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#abb3518c2e8a95fe2496d295fe14b91df',1,'T: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a2ee4b3e799d56c4d34c87190c37a7a64',1,'T: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#abb3518c2e8a95fe2496d295fe14b91df',1,'T: 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#abb3518c2e8a95fe2496d295fe14b91df',1,'T: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#abb3518c2e8a95fe2496d295fe14b91df',1,'T: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../namespacefbgemm__gpu.html#a2bef322c4183a01bc9d8e3c084ae1d15',1,'fbgemm_gpu::T']]], + ['table_5foffset_2',['table_offset',['../namespacefbgemm__gpu.html#a242d5a911279d9ad2128346af039383f',1,'fbgemm_gpu']]], + ['table_5foffsets_3',['table_offsets',['../namespacefbgemm__gpu.html#a114a2ddecfbdbb209bc791977fcb1c0e',1,'fbgemm_gpu']]], + ['table_5fwarp_5fid_4',['table_warp_id',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a557e928f55b7bdfe7824b6ddd0fcfbff',1,'table_warp_id: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a557e928f55b7bdfe7824b6ddd0fcfbff',1,'table_warp_id: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['tbe_5finput_5fcombine_5fcpu_5',['tbe_input_combine_cpu',['../group__input-combine.html#ga4f8f3f8b825c9d7639c1e45e8dc8b689',1,'fbgemm_gpu']]], + ['tbe_5finput_5fcombine_5fwith_5flength_5fcpu_6',['tbe_input_combine_with_length_cpu',['../namespacefbgemm__gpu.html#a56da764643d07d366219d69333e6f9de',1,'fbgemm_gpu']]], + ['tbe_5finput_5fcombine_5fwith_5flength_5fcuda_7',['tbe_input_combine_with_length_cuda',['../namespacefbgemm__gpu.html#ae818a54243bd2ea4c0841088f07ff327',1,'fbgemm_gpu']]], + ['tbe_5finput_5fcombine_5fwith_5flength_5fgpu_8',['tbe_input_combine_with_length_gpu',['../namespacefbgemm__gpu.html#af7db32b23d955e760c7dfb4b29a13ca1',1,'fbgemm_gpu']]], + ['temp_5fstorage_9',['temp_storage',['../namespacefbgemm__gpu.html#ad0fce99009259dbc5e5c0527eb5b3f64',1,'fbgemm_gpu']]], + 
['tensor_10',['Tensor',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_backward_codegen_cuda.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__backward__adagrad__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_cpu.cpp'],['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__dense__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_cpu.cpp'],['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_weighted_cuda.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_cpu.cpp'],['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_adagrad.cpp'],['../gen__embedding__backward__split__adagrad__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_adagrad_cpu.cpp'],['../gen__embedding__backward__split__adam_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_adam.cpp'],['../gen__embedding__backward__split__adam__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_adam_cpu.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp'],['../gen__embedding__backward__split__approx__sgd_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_sgd.cpp'],['../gen__embedding__backward__split__approx__sgd__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_approx_sgd_cpu.cpp'],['../gen__embedding__backward__split__grad_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_lamb.cpp'],['../gen__embedding__backward__split__lamb__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_lamb_cpu.cpp'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_lars_sgd.cpp'],['../gen__embedding__backward__split__lars__sgd__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_lars_sgd_cpu.cpp'],['../gen__embedding__backward__split__none_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_none.cpp'],['../gen__embedding__backward__split__none__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_split_none_cpu.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_partial_rowwise_adam.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_partial_rowwise_lamb.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_adagrad_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_weighted_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp'],['../gen__embedding__backward__split__sgd_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_sgd.cpp'],['../gen__embedding__backward__split__sgd__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_backward_split_sgd_cpu.cpp'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_forward_dense_weighted_codegen_meta.cpp'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
gen_embedding_optimizer_rowwise_adagrad_split.cpp'],['../gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu'],['../batch__index__select__dim0__cpu__host_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: batch_index_select_dim0_cpu_host.cpp'],['../batch__index__select__dim0__host_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: batch_index_select_dim0_host.cpp'],['../embedding__backward__dense__host_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_dense_host.cpp'],['../embedding__backward__dense__host__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_dense_host_cpu.cpp'],['../embedding__backward__split__cpu__approx__template_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_split_cpu_approx_template.cpp'],['../embedding__backward__split__cpu__template_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_split_cpu_template.cpp'],['../embedding__backward__split__grad__template_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_split_grad_template.cu'],['../embedding__backward__split__host__cpu__template_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_split_host_cpu_template.cpp'],['../embedding__backward__split__host__template_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_backward_split_host_template.cpp'],['../embedding__bounds__check_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_bounds_check.cu'],['../embedding__bounds__check__host_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_bounds_check_host.cpp'],['../embedding__bounds__check__host__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_bounds_check_host_cpu.cpp'],['../embedding__forward__quantized__host_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
embedding_forward_quantized_host.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_forward_quantized_host_cpu.cpp'],['../embedding__forward__quantized__split__lookup_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_forward_quantized_split_lookup.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#a1c03911dcc4fa0b0d2819531e1148a4f',1,'Tensor: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__kernel__template_8cu.html#a1c03911dcc4fa0b0d2819531e1148a4f',1,'Tensor: embedding_forward_quantized_split_nbit_kernel_template.cu'],['../embedding__forward__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_forward_split_cpu.cpp'],['../embedding__forward__split__meta__template_8cpp.html#abd5e6ae11c42b2e53f1da6fa1f4646ed',1,'Tensor: embedding_forward_split_meta_template.cpp'],['../embedding__optimizer__split__host__template_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_optimizer_split_host_template.cpp'],['../embedding__optimizer__split__template_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_optimizer_split_template.cu'],['../namespacefbgemm__gpu.html#ae2016e9bbb2f470174708fc60cd7592f',1,'fbgemm_gpu::Tensor'],['../embedding__inplace__update_8h.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_inplace_update.h'],['../embedding__inplace__update_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_inplace_update.cu'],['../embedding__inplace__update__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: embedding_inplace_update_cpu.cpp'],['../histogram__binning__calibration__ops_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: histogram_binning_calibration_ops.cu'],['../input__combine_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
input_combine.cu'],['../input__combine__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: input_combine_cpu.cpp'],['../input__combine__gpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: input_combine_gpu.cpp'],['../batched__dense__vec__jagged__2d__mul__backward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: batched_dense_vec_jagged_2d_mul_backward.cu'],['../batched__dense__vec__jagged__2d__mul__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: batched_dense_vec_jagged_2d_mul_forward.cu'],['../dense__to__jagged__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: dense_to_jagged_forward.cu'],['../jagged__dense__bmm__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_dense_bmm_forward.cu'],['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_dense_dense_elementwise_add_jagged_output_forward.cu'],['../jagged__dense__elementwise__mul__backward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_dense_elementwise_mul_backward.cu'],['../jagged__dense__elementwise__mul__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_dense_elementwise_mul_forward.cu'],['../jagged__index__add__2d__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_index_add_2d_forward.cu'],['../jagged__index__select__2d__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_index_select_2d_forward.cu'],['../jagged__jagged__bmm__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_jagged_bmm_forward.cu'],['../jagged__softmax__backward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_softmax_backward.cu'],['../jagged__softmax__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_softmax_forward.cu'],['../jagged__to__padded__dense__backward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
jagged_to_padded_dense_backward.cu'],['../jagged__to__padded__dense__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_to_padded_dense_forward.cu'],['../jagged__unique__indices_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: jagged_unique_indices.cu'],['../keyed__jagged__index__select__dim1_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: keyed_jagged_index_select_dim1.cu'],['../layout__transform__ops_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: layout_transform_ops.cu'],['../layout__transform__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: layout_transform_ops_cpu.cpp'],['../memory__utils_2common_8h.html#abc1167888f441327c12e300780ee568a',1,'Tensor: common.h'],['../memory__utils_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: memory_utils.cpp'],['../memory__utils__ops_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: memory_utils_ops.cpp'],['../memory__utils__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: memory_utils_ops_cpu.cpp'],['../merge__pooled__embedding__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: merge_pooled_embedding_ops_cpu.cpp'],['../merge__pooled__embedding__ops__gpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: merge_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__function_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_function.cpp'],['../permute__pooled__embedding__ops_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_ops.cu'],['../permute__pooled__embedding__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__gpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__ops__split_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
permute_pooled_embedding_ops_split.cu'],['../permute__pooled__embedding__ops__split__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_ops_split_cpu.cpp'],['../permute__pooled__embedding__ops__split__gpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: permute_pooled_embedding_ops_split_gpu.cpp'],['../quantize__ops_2common_8cuh.html#abc1167888f441327c12e300780ee568a',1,'Tensor: common.cuh'],['../quantize__bfloat16_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_bfloat16.cu'],['../quantize__fp8__rowwise_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_fp8_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_fused_8bit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_fused_nbit_rowwise.cu'],['../quantize__hfp8_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_hfp8.cu'],['../quantize__msfp_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_msfp.cu'],['../quantize__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_ops_cpu.cpp'],['../quantize__ops__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_ops_meta.cpp'],['../quantize__padded__fp8__rowwise_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: quantize_padded_fp8_rowwise.cu'],['../sparse__ops_2common_8cuh.html#abc1167888f441327c12e300780ee568a',1,'Tensor: common.cuh'],['../sparse__async__cumsum_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_async_cumsum.cu'],['../sparse__batched__unary__embeddings_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_batched_unary_embeddings.cu'],['../sparse__block__bucketize__features_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_block_bucketize_features.cu'],['../sparse__bucketize__features_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
sparse_bucketize_features.cu'],['../sparse__compute__frequency__sequence_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_compute_frequency_sequence.cu'],['../sparse__expand__into__jagged__permute_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_expand_into_jagged_permute.cu'],['../sparse__group__index_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_group_index.cu'],['../sparse__index__add_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_index_add.cu'],['../sparse__index__select_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_index_select.cu'],['../sparse__invert__permute_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_invert_permute.cu'],['../sparse__ops__cpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_ops_cpu.cpp'],['../sparse__ops__gpu_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_ops_gpu.cpp'],['../sparse__ops__meta_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_ops_meta.cpp'],['../sparse__pack__segments__backward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_pack_segments_backward.cu'],['../sparse__pack__segments__forward_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_pack_segments_forward.cu'],['../sparse__permute102_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_permute102.cu'],['../sparse__permute__1d_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_permute_1d.cu'],['../sparse__permute__2d_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_permute_2d.cu'],['../sparse__permute__embeddings_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_permute_embeddings.cu'],['../sparse__range_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_range.cu'],['../sparse__reorder__batched__ad_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
sparse_reorder_batched_ad.cu'],['../sparse__segment__sum__csr_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_segment_sum_csr.cu'],['../sparse__zipf_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: sparse_zipf.cu'],['../split__embeddings__cache_2common_8cuh.html#abc1167888f441327c12e300780ee568a',1,'Tensor: common.cuh'],['../split__embeddings__cache_2common_8h.html#abc1167888f441327c12e300780ee568a',1,'Tensor: common.h'],['../lfu__cache__find_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lfu_cache_find.cu'],['../lfu__cache__populate_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lfu_cache_populate.cu'],['../lfu__cache__populate__byte_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lfu_cache_populate_byte.cpp'],['../lfu__cache__populate__byte_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lfu_cache_populate_byte.cu'],['../linearize__cache__indices_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: linearize_cache_indices.cpp'],['../linearize__cache__indices_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: linearize_cache_indices.cu'],['../lru__cache__find_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lru_cache_find.cu'],['../lru__cache__populate_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lru_cache_populate.cu'],['../lru__cache__populate__byte_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lru_cache_populate_byte.cpp'],['../lru__cache__populate__byte_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lru_cache_populate_byte.cu'],['../lxu__cache_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lxu_cache.cpp'],['../lxu__cache_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: lxu_cache.cu'],['../reset__weight__momentum_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: reset_weight_momentum.cu'],['../generate__vbe__metadata_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: 
generate_vbe_metadata.cu'],['../get__infos__metadata_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: get_infos_metadata.cu'],['../radix__sort__pairs_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: radix_sort_pairs.cu'],['../split__embeddings__utils_8cpp.html#abc1167888f441327c12e300780ee568a',1,'Tensor: split_embeddings_utils.cpp'],['../transpose__embedding__input_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: transpose_embedding_input.cu'],['../ssd__split__embeddings__cache__cuda_8cu.html#abc1167888f441327c12e300780ee568a',1,'Tensor: ssd_split_embeddings_cache_cuda.cu']]], + ['tensor_20cuda_20operators_11',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], + ['tensor_20operators_12',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], + ['tensor_5fassert_5ftest_2ecpp_13',['tensor_assert_test.cpp',['../tensor__assert__test_8cpp.html',1,'']]], + ['tensor_5fcontiguous_14',['TENSOR_CONTIGUOUS',['../sparse__ops__utils_8h.html#a333341c9590667c47753510e0da7b6e3',1,'sparse_ops_utils.h']]], + ['tensor_5fcontiguous_5fand_5fon_5fcpu_15',['TENSOR_CONTIGUOUS_AND_ON_CPU',['../sparse__ops__utils_8h.html#a0378cd5f9e716f13079b83a9b9805691',1,'sparse_ops_utils.h']]], + ['tensor_5fcontiguous_5fand_5fon_5fcuda_5fgpu_16',['TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#a350ade6aa989687c2ca8ced000e200ff',1,'sparse_ops_utils.h']]], + ['tensor_5fempty_5for_5fon_5fcpu_17',['TENSOR_EMPTY_OR_ON_CPU',['../sparse__ops__utils_8h.html#a73ab1987fec37ac982ae1ed77be0e3ea',1,'sparse_ops_utils.h']]], + ['tensor_5fempty_5for_5fon_5fcuda_5fgpu_18',['TENSOR_EMPTY_OR_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#aff83e4ada08cf70146ffc4ac2009aa9a',1,'sparse_ops_utils.h']]], + ['tensor_5fndim_5fequals_19',['TENSOR_NDIM_EQUALS',['../sparse__ops__utils_8h.html#a485f848acf189619cb61a0ae7534eaa1',1,'sparse_ops_utils.h']]], + 
['tensor_5fndim_5fexceeds_20',['TENSOR_NDIM_EXCEEDS',['../sparse__ops__utils_8h.html#acfab048550cb0518bdb1ac267ef1e7ba',1,'sparse_ops_utils.h']]], + ['tensor_5fndim_5fis_5fge_21',['TENSOR_NDIM_IS_GE',['../sparse__ops__utils_8h.html#abd9e69a82885e6e361275a0b08ebe565',1,'sparse_ops_utils.h']]], + ['tensor_5fon_5fcpu_22',['TENSOR_ON_CPU',['../sparse__ops__utils_8h.html#a5d19d4051835acd2c6d83eb637341010',1,'sparse_ops_utils.h']]], + ['tensor_5fon_5fcuda_5fgpu_23',['TENSOR_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#ac6089c2908cb1ae6367af5cf7bbea30d',1,'sparse_ops_utils.h']]], + ['tensor_5fon_5fsame_5fgpu_5fif_5fnot_5foptional_5fcheck_24',['tensor_on_same_gpu_if_not_optional_check',['../sparse__ops__utils_8h.html#a5a8411338d3eef3620c7f5be3803c7cd',1,'sparse_ops_utils.h']]], + ['tensor_5ftype_5fmust_5fbe_25',['TENSOR_TYPE_MUST_BE',['../sparse__ops__utils_8h.html#a003b5640cfa59fe8f5da9b1c9fcb8f26',1,'sparse_ops_utils.h']]], + ['tensoraccessor_26',['TensorAccessor',['../classfbgemm__gpu_1_1_tensor_accessor.html',1,'TensorAccessor< T, N, PtrTraits, index_t >'],['../classfbgemm__gpu_1_1_tensor_accessor.html#a6b681d8fc7f13b4b8d31426ec10a0f11',1,'fbgemm_gpu::TensorAccessor::TensorAccessor()'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a55169dff4cc835156c5ccd43240b4c8c',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::TensorAccessor()']]], + ['tensoraccessor_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_27',['TensorAccessor< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html',1,'fbgemm_gpu']]], + ['tensoraccessorbase_28',['TensorAccessorBase',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'TensorAccessorBase< T, N, PtrTraits, index_t >'],['../classfbgemm__gpu_1_1_tensor_accessor_base.html#ac139dc2b8e88aec4b189a6c41bc135af',1,'fbgemm_gpu::TensorAccessorBase::TensorAccessorBase()']]], + 
['tensoraccessorbase_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_29',['TensorAccessorBase< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['tensoraccessorbase_3c_20t_2c_20n_2c_20defaultptrtraits_2c_20int64_5ft_20_3e_30',['TensorAccessorBase< T, N, DefaultPtrTraits, int64_t >',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['tensors_5fempty_5for_5fon_5fsame_5fdevice_31',['TENSORS_EMPTY_OR_ON_SAME_DEVICE',['../sparse__ops__utils_8h.html#a3df91ae56fe10d1c002bed63e5b78d1b',1,'sparse_ops_utils.h']]], + ['tensors_5fhave_5fsame_5fnumel_32',['TENSORS_HAVE_SAME_NUMEL',['../sparse__ops__utils_8h.html#a9be1e573e7d3e35f3db03210e2624e61',1,'sparse_ops_utils.h']]], + ['tensors_5fhave_5fsame_5ftype_33',['TENSORS_HAVE_SAME_TYPE',['../sparse__ops__utils_8h.html#a97687675a3398d3168fe8f07a1b4db87',1,'sparse_ops_utils.h']]], + ['tensors_5fon_5fsame_5fcuda_5fgpu_5fif_5fnot_5foptional_34',['TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL',['../sparse__ops__utils_8h.html#a4724e1d67266b6998b8fe4ef1ec743d9',1,'sparse_ops_utils.h']]], + ['tensors_5fon_5fsame_5fdevice_35',['TENSORS_ON_SAME_DEVICE',['../sparse__ops__utils_8h.html#aa6ef8e13e3280066cc5f4f0970d3e7a6',1,'sparse_ops_utils.h']]], + ['test_36',['TEST',['../embedding__inplace__update__test_8cpp.html#a8eb96d7f557ba896e48fef81f259d7a5',1,'TEST(EmbeddingInplaceUpdateTest, random_update): embedding_inplace_update_test.cpp'],['../cpu__kernel__test_8cpp.html#aa2c7091971cf4fd4bcbb3215ebe612cf',1,'TEST(cpu_kernel_test, csr2csc_test): cpu_kernel_test.cpp'],['../sparse__ops__utils__test_8cpp.html#a9011669ae997bae59aa8f141bd794f11',1,'TEST(sparse_ops_utils_test, undefined_tensors_do_not_trigger): sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#a2d4ac7a4fb22c0789d8510d17f3878db',1,'TEST(sparse_ops_utils_test, cpu_tensors_fail): 
sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#adc3b9330a7cac1cf2e07268fe7a6bd17',1,'TEST(sparse_ops_utils_test, gpu_tensors_pass): sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#ae888046a03bb3fe0f87d23c4915f6994',1,'TEST(sparse_ops_utils_test, optional_tensor_passes): sparse_ops_utils_test.cpp'],['../tensor__assert__test_8cpp.html#af3ce575ab5810b31aae3455d53faacee',1,'TEST(tensor_assert_test, gpu_asserts): tensor_assert_test.cpp'],['../uvm__cache__miss__emulate__test_8cpp.html#aab721325808448b876b97faee4b751b9',1,'TEST(uvm_cache_miss_emulate_test, no_cache_miss): uvm_cache_miss_emulate_test.cpp'],['../uvm__cache__miss__emulate__test_8cpp.html#acdba631ddc8a5dc4e4ee2c02959d3e14',1,'TEST(uvm_cache_miss_emulate_test, enforced_cache_miss): uvm_cache_miss_emulate_test.cpp']]], + ['test_5fembedding_5finplace_5fupdate_37',['test_embedding_inplace_update',['../embedding__inplace__update__test_8cpp.html#aac82e2990c8f2f7d3957f862975181a0',1,'embedding_inplace_update_test.cpp']]], + ['thrust_5ffind_5fqparams_38',['thrust_find_qparams',['../namespacefbgemm__gpu.html#a6c54f589eee05a58cebd4cf7cf8b1086',1,'fbgemm_gpu::thrust_find_qparams(scalar_t *input_row, int D)'],['../namespacefbgemm__gpu.html#a8145ebe65a5242bd7a3a15de0d69a70b',1,'fbgemm_gpu::thrust_find_qparams(fbgemm_gpu::Vec4T< scalar_t > *input_row, int D)']]], + ['to_5fbfloat16_39',['to_bfloat16',['../namespacefbgemm__gpu.html#a9d1e20705b5c1c16dd554c81b3766b93',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f16_40',['to_bfloat16_16',['../namespacefbgemm__gpu.html#a3f6b99cce95aa3d297e4b824e577d62d',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f2_41',['to_bfloat16_2',['../namespacefbgemm__gpu.html#a2b8a7fb1619f338df717ef075fe513e4',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f4_42',['to_bfloat16_4',['../namespacefbgemm__gpu.html#a7d0d7114d05a683328a782804ef2bef9',1,'fbgemm_gpu']]], + 
['to_5fbfloat16_5f8_43',['to_bfloat16_8',['../namespacefbgemm__gpu.html#a74f150a063fed3144f6d99cde2d46069',1,'fbgemm_gpu']]], + ['to_5fhalf_44',['to_half',['../namespacefbgemm__gpu.html#a3e13c4ba1e371f3bcabf7f6f74ac103e',1,'fbgemm_gpu']]], + ['to_5fhalf16_45',['to_half16',['../namespacefbgemm__gpu.html#a776872b9c8f667b7d05aea83e7287d5d',1,'fbgemm_gpu']]], + ['to_5fhalf2_46',['to_half2',['../namespacefbgemm__gpu.html#aaed7807ac8eef0fb786324d5935c4aca',1,'fbgemm_gpu']]], + ['to_5fhalf4_47',['to_half4',['../namespacefbgemm__gpu.html#aee1f23de5e5847146cd821595d1978ae',1,'fbgemm_gpu']]], + ['to_5fhalf8_48',['to_half8',['../namespacefbgemm__gpu.html#a40088f5e88d0985b0c9b08808c40e1dd',1,'fbgemm_gpu']]], + ['topology_5futils_2ecpp_49',['topology_utils.cpp',['../topology__utils_8cpp.html',1,'']]], + ['topology_5futils_2eh_50',['topology_utils.h',['../topology__utils_8h.html',1,'']]], + ['torch_5flibrary_5ffragment_51',['TORCH_LIBRARY_FRAGMENT',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_cpu.cpp'],['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__dense__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_cpu.cpp'],['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_weighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, 
m): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_cpu.cpp'],['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_adagrad.cpp'],['../gen__embedding__backward__split__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_adagrad.cpp'],['../gen__embedding__backward__split__adam_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): 
gen_embedding_backward_split_adam.cpp'],['../gen__embedding__backward__split__adam_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_adam.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__approx__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_sgd.cpp'],['../gen__embedding__backward__split__approx__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_split_approx_sgd.cpp'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_lamb.cpp'],['../gen__embedding__backward__split__lamb_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_lamb.cpp'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_lars_sgd.cpp'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_lars_sgd.cpp'],['../gen__embedding__backward__split__none_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_none.cpp'],['../gen__embedding__backward__split__none_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_none.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_partial_rowwise_adam.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_partial_rowwise_adam.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_partial_rowwise_lamb.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_split_partial_rowwise_lamb.cpp'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_weighted_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_weighted_adagrad.cpp'],['../gen__embedding__backward__split__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): 
gen_embedding_backward_split_sgd.cpp'],['../gen__embedding__backward__split__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_sgd.cpp'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_weighted_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp'],['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_optimizer_rowwise_adagrad_split.cpp'],['../batch__index__select__dim0__cpu__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): batch_index_select_dim0_cpu_host.cpp'],['../batch__index__select__dim0__cpu__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): batch_index_select_dim0_cpu_host.cpp'],['../batch__index__select__dim0__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): batch_index_select_dim0_host.cpp'],['../batch__index__select__dim0__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): batch_index_select_dim0_host.cpp'],['../embedding__backward__dense__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): embedding_backward_dense_host.cpp'],['../embedding__backward__dense__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_backward_dense_host.cpp'],['../embedding__bounds__check__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): 
embedding_bounds_check_host.cpp'],['../embedding__bounds__check__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_bounds_check_host.cpp'],['../embedding__bounds__check__host__cpu_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): embedding_bounds_check_host_cpu.cpp'],['../embedding__bounds__check__host__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_bounds_check_host_cpu.cpp'],['../embedding__forward__quantized__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_forward_quantized_host.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_forward_quantized_host_cpu.cpp'],['../embedding__optimizer__split__host__template_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_optimizer_split_host_template.cpp'],['../embedding__inplace__update__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_inplace_update_cpu.cpp'],['../embedding__inplace__update__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_inplace_update_gpu.cpp'],['../input__combine__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): input_combine_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): jagged_tensor_ops_cpu.cpp'],['../layout__transform__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): layout_transform_ops_cpu.cpp'],['../namespacefbgemm__gpu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'fbgemm_gpu::TORCH_LIBRARY_FRAGMENT()'],['../merge__pooled__embedding__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
merge_pooled_embedding_ops_cpu.cpp'],['../merge__pooled__embedding__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): merge_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__ops__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_split_cpu.cpp'],['../permute__pooled__embedding__ops__split__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_split_gpu.cpp'],['../quantize__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): quantize_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_ops_cpu.cpp'],['../sparse__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_ops_gpu.cpp'],['../sparse__zipf_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_zipf.cu'],['../split__embeddings__utils_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): split_embeddings_utils.cpp']]], + ['torch_5flibrary_5fimpl_52',['TORCH_LIBRARY_IMPL',['../namespacefbgemm__gpu.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'fbgemm_gpu::TORCH_LIBRARY_IMPL()'],['../jagged__tensor__ops__autograd_8cpp.html#a89761ba0ed893bf88bdfdd1f6d15bc65',1,'TORCH_LIBRARY_IMPL(fbgemm, Autograd, m): jagged_tensor_ops_autograd.cpp'],['../jagged__tensor__ops__autograd_8cpp.html#a5eca359a14102dd9fcab1f8e80594472',1,'TORCH_LIBRARY_IMPL(fbgemm, CompositeImplicitAutograd, m): 
jagged_tensor_ops_autograd.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#aa138561d0eb99d73b2bf9586b84e7c46',1,'TORCH_LIBRARY_IMPL(fbgemm, CompositeExplicitAutograd, m): jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): jagged_tensor_ops_meta.cpp'],['../layout__transform__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): layout_transform_ops_cpu.cpp'],['../layout__transform__ops__gpu_8cpp.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'TORCH_LIBRARY_IMPL(fbgemm, CUDA, m): layout_transform_ops_gpu.cpp'],['../quantize__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): quantize_ops_cpu.cpp'],['../quantize__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): quantize_ops_meta.cpp'],['../sparse__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#a89761ba0ed893bf88bdfdd1f6d15bc65',1,'TORCH_LIBRARY_IMPL(fbgemm, Autograd, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#af0fdef89a7a61f1f510ed4bb5f6d5398',1,'TORCH_LIBRARY_IMPL(fbgemm, AutogradCPU, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_cpu.cpp'],['../sparse__ops__gpu_8cpp.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'TORCH_LIBRARY_IMPL(fbgemm, CUDA, m): sparse_ops_gpu.cpp'],['../sparse__ops__gpu_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_gpu.cpp'],['../sparse__ops__gpu_8cpp.html#a8fd406590cd83f4dec4a63c7c1b9ce78',1,'TORCH_LIBRARY_IMPL(fbgemm, AutogradCUDA, m): 
sparse_ops_gpu.cpp'],['../sparse__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_meta.cpp'],['../split__embeddings__utils_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): split_embeddings_utils.cpp']]], + ['torch_5ftensor_5fdevice_5fname_53',['torch_tensor_device_name',['../sparse__ops__utils_8h.html#a535403fdc5c523b45f0d56d657e17f7b',1,'torch_tensor_device_name(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a319c921d3abe8bdb14140b45afe9afdb',1,'torch_tensor_device_name(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fempty_5for_5fon_5fcpu_5fcheck_54',['torch_tensor_empty_or_on_cpu_check',['../sparse__ops__utils_8h.html#a6328f240dd58293d0349471dca28797e',1,'torch_tensor_empty_or_on_cpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#afc4520e447e8ad48a316af75860d84ae',1,'torch_tensor_empty_or_on_cpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fempty_5for_5fon_5fcuda_5fgpu_5fcheck_55',['torch_tensor_empty_or_on_cuda_gpu_check',['../sparse__ops__utils_8h.html#abb9778e9fb75a70593c27e53dca268cd',1,'torch_tensor_empty_or_on_cuda_gpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#aac863615b6eba91282fcf07b5e9a5460',1,'torch_tensor_empty_or_on_cuda_gpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fon_5fcpu_5fcheck_56',['torch_tensor_on_cpu_check',['../sparse__ops__utils_8h.html#ad971d56f6b82b6c62a2d6fed276b0463',1,'torch_tensor_on_cpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#af4afd1e331412cf092a70d0fd816aed8',1,'torch_tensor_on_cpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + 
['torch_5ftensor_5fon_5fcuda_5fgpu_5fcheck_57',['torch_tensor_on_cuda_gpu_check',['../sparse__ops__utils_8h.html#a5568d44e6066339da1326798f9637b16',1,'torch_tensor_on_cuda_gpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a99211623695fce2a359b74a5823b58b8',1,'torch_tensor_on_cuda_gpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fon_5fsame_5fdevice_5fcheck_58',['torch_tensor_on_same_device_check',['../sparse__ops__utils_8h.html#a5683dd4c2143c3c0ba0eeb80fd5223f0',1,'torch_tensor_on_same_device_check(const at::Tensor &ten1, const at::Tensor &ten2): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#ac60c66ce5a4058e4906907960f82f1be',1,'torch_tensor_on_same_device_check(const at::Tensor &ten1, const c10::optional< at::Tensor > &ten2): sparse_ops_utils.h']]], + ['torch_5ftensor_5fundefined_59',['torch_tensor_undefined',['../sparse__ops__utils_8h.html#ab583553d9bf8ca92fadb8a81ffd40cd8',1,'torch_tensor_undefined(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a5e916ca6a05a17d36e5341d929cc18e0',1,'torch_tensor_undefined(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['total_5fl_60',['total_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aecbb8e032512c651d4a4d6c76c201528',1,'total_L: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aecbb8e032512c651d4a4d6c76c201528',1,'total_L: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['total_5fl_5foffsets_61',['total_L_offsets',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#aa162b310777fc83fbde6ed5d0d35df4c',1,'gen_batch_index_select_dim0_forward_kernel_small.cu']]], + ['total_5fload_5fd_62',['total_load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a52ea0aaf4b80b614a42c9d62c2b17730',1,'total_load_D: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a52ea0aaf4b80b614a42c9d62c2b17730',1,'total_load_D: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['transformation_20cpu_20operators_63',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], + ['transformation_20cuda_20operators_64',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]], + ['transpose_65',['transpose',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#aa4aba7637a10c7b8b839ef27952e855d',1,'fbgemm_gpu::GenericPackedTensorAccessor::transpose()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a0ed7d1e6f585332c781fc568e1fad1ac',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::transpose()']]], + ['transpose_5fembedding_5finput_66',['transpose_embedding_input',['../split__embeddings__utils_8cuh.html#a508f832d3fec529868cbb1f9fa9defc8',1,'transpose_embedding_input(at::Tensor hash_size_cumsum, int64_t total_hash_size_bits, at::Tensor indices, at::Tensor offsets, bool nobag=false, const c10::optional< at::Tensor > &vbe_b_t_map=c10::optional< at::Tensor >(), const int64_t info_B_num_bits=26, const int64_t info_B_mask=0x2FFFFFF, const int64_t total_unique_indices=-1, const bool is_index_select=false, const c10::optional< at::Tensor > &total_L_offsets=c10::optional< at::Tensor >(), const int64_t fixed_L_per_warp=0, const int64_t num_warps_per_feature=0): split_embeddings_utils.cuh'],['../transpose__embedding__input_8cu.html#a569a769e3233130cce363d9ae151bd26',1,'transpose_embedding_input(Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, bool nobag, const c10::optional< Tensor > &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask, const int64_t total_unique_indices, const bool is_index_select, const c10::optional< 
Tensor > &total_L_offsets, const int64_t fixed_L_per_warp, const int64_t num_warps_per_feature): transpose_embedding_input.cu']]], + ['transpose_5fembedding_5finput_2ecu_67',['transpose_embedding_input.cu',['../transpose__embedding__input_8cu.html',1,'']]], + ['trapz_5fkernel_68',['trapz_kernel',['../namespacefbgemm__gpu.html#a45142e19fe831c9d085bb097b7d946b2',1,'fbgemm_gpu']]], + ['true_69',['true',['../gen__embedding__forward__split__unweighted__kernel_8cu.html#acc5baa8672e7ddf3cefb150e4660d86a',1,'true: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#acc5baa8672e7ddf3cefb150e4660d86a',1,'true: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#acc5baa8672e7ddf3cefb150e4660d86a',1,'true: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#acc5baa8672e7ddf3cefb150e4660d86a',1,'true: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#acc5baa8672e7ddf3cefb150e4660d86a',1,'true: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__backward__split__grad_8cu.html#af0ccb06b8169682c123d1399ed8e1869',1,'true(const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > dev_or_uvm_unique_indices, const int info_B_num_bits): gen_embedding_backward_split_grad.cu'],['../namespacenbit.html#ae298c42e84018c608c72200f61270827',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a9233b0f37aec7890155371e3f1f8a4c6',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ad461b37bcc67ce85965ea3d63318b609',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6610e53a686bcaa7c0c055493223b286',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, 
at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa0e536c5986677aa5c753d497c9ec6ea',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a744a48f6ba12a807eed65323fac0d7b9',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa7f9e825cb23814721fa128e75fd54df',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#af5e4b89707ccb6db711f4b214120f6d4',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae950ec6b1a6c8e70896ceea8585e8a94',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a80b1856aa5c50bef02b6cfc6e07a738f',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5190453e12b3ae3d90ccbad2d0fd3366',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aeab80be016250076834edd018371fadc',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ac4473fe74a275df878cef6094b97142f',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5e7304badb9669f2af28007bc9faa533',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a000a2e8569876d491d4d9578f5bca2fb',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a1b561270c0c573adbb9b099b20a3ca71',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa4e2b761fd2635bd5d972c84f9e28837',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aea0485b6b1bbf758999bd85f6affc052',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ab843cff102b60ffbfb639c2371b90f7b',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a085775b780406668fe81c55a30eb3098',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5614c839b9baa44dd6962fe11a148918',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#af580fa47263724bff70ce910764bea41',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations)'],['../namespacenbit.html#acb54005a5872970a6721deca8ff5cd99',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a7d2686b58c584f889807ad3902056eac',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const 
int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a06d07c66722a850f758f54932d3dbe17',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1aa60c6099666e18389fa1e982910986',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8de160ae737c50e86160493247817870',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa74dcf7a765d22c0b1ec49310c9a04b3',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5649e552b4b7bb69095114018ba395fb',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const 
FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9ec2bf37e5db917feed838745ed81985',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a16cf98f36e41cdcacdb6dabac0b258e0',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ab6f0a4b5648537896b38264e4d38f9aa',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1e7e3a44299ea276cb2e5f5082977777',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a3d5bd72dd7f6e6c6b0a50b2070e74f45',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, 
const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af605abd85d3cc9e6dca40ea687104f6e',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9c2ae1d1bfa19b2caadbc8e76c32697c',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa41a6064cb3571ecd43c9da816216785',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a66d27435490ba7673e7362fca9cc8f7e',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad88bb49652d4d156c75abb8ca2419542',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const 
uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0a28fe8dcfa38da6241b67d3ec3e4ff2',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a09d02507a5cf390975fafa6a5c7096e8',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t 
*__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa93f293dcfd38afcd57776f33ceb8490',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab59b0abf8963d48e63c90334daea4fc5',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab80c4590dcdff94d23d4f89f1c7e0039',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const 
lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#addc9e8fb4cd569b143bff818ca6e068b',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aca3f7571841f3f5e46e703a210f5ef3d',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ 
const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afeba51154f1a22327b47305480f43671',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a4a3bc2db616d7f8f845d8e0cd092fd56',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['two_5fto_5fe_70',['two_to_e',['../verify__fp16__stochastic__benchmark_8cu.html#ab29b4915253bcafe11f5d95cfb227c0b',1,'verify_fp16_stochastic_benchmark.cu']]], + 
['type_71',['type',['../struct_vec4_type_3_01float_01_4.html#aef2d7a9710bd35cfd4161c950176220e',1,'Vec4Type< float >::type'],['../struct_vec4_type_3_01at_1_1_half_01_4.html#af96b1e07047414416d113699f4285a02',1,'Vec4Type< at::Half >::type'],['../struct_vec4_type_3_01uint8__t_01_4.html#aeeb5ec644b58a782b9dbaa98b3475cad',1,'Vec4Type< uint8_t >::type'],['../struct_vec4_type_3_01float_01_4.html#aef2d7a9710bd35cfd4161c950176220e',1,'Vec4Type< float >::type'],['../struct_vec4_type_3_01at_1_1_half_01_4.html#af96b1e07047414416d113699f4285a02',1,'Vec4Type< at::Half >::type'],['../struct_vec4_type_3_01uint8__t_01_4.html#aeeb5ec644b58a782b9dbaa98b3475cad',1,'Vec4Type< uint8_t >::type'],['../struct_vec4_type_3_01float_01_4.html#aef2d7a9710bd35cfd4161c950176220e',1,'Vec4Type< float >::type'],['../struct_vec4_type_3_01at_1_1_half_01_4.html#af96b1e07047414416d113699f4285a02',1,'Vec4Type< at::Half >::type'],['../struct_vec4_type_3_01uint8__t_01_4.html#aeeb5ec644b58a782b9dbaa98b3475cad',1,'Vec4Type< uint8_t >::type']]] +]; diff --git a/search/all_15.js b/search/all_15.js new file mode 100644 index 000000000..b2cfb5921 --- /dev/null +++ b/search/all_15.js @@ -0,0 +1,21 @@ +var searchData= +[ + ['uint32_5ft_0',['uint32_t',['../gen__embedding__backward__split__grad_8cu.html#abe53421bcec0b67763c3ed41e3a2a2ad',1,'gen_embedding_backward_split_grad.cu']]], + ['uint8_5ft_1',['uint8_t',['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: 
gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_dense_weighted_kernel.cu'],['../namespacenbit.html#a1360e7840ee58417b26bf9445f94c59d',1,'nbit::uint8_t'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a1360e7840ee58417b26bf9445f94c59d',1,'uint8_t: gen_embedding_forward_split_weighted_vbe_kernel.cu']]], + ['unbucketize_5fpermute_5fdata_2',['unbucketize_permute_data',['../namespacefbgemm__gpu.html#aa403c596f919b42af361fc6554cce9e0',1,'fbgemm_gpu']]], + 
['uncalibrated_3',['uncalibrated',['../namespacefbgemm__gpu.html#a7b13aa0c4501d0593484a73afe8786c2',1,'fbgemm_gpu']]], + ['unpack_5fsegments_5fcuda_5fkernel_4',['unpack_segments_cuda_kernel',['../namespacefbgemm__gpu.html#a0ca17769ee2a4593b447a78e3d3fe429',1,'fbgemm_gpu']]], + ['unpadded_5frow_5fsize_5fin_5fbytes_5',['unpadded_row_size_in_bytes',['../namespacenbit.html#a7654c0df9e54aa58c35fe39c53130cbc',1,'nbit']]], + ['uoffset_5ft_6',['uoffset_t',['../namespacefbgemm__gpu.html#ae8406b85b19117866badffef9481f3e2',1,'fbgemm_gpu']]], + ['use_5flxu_5fcache_7',['use_lxu_cache',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa7b80f1189d1874ab861378ed299a21e',1,'use_lxu_cache: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa7b80f1189d1874ab861378ed299a21e',1,'use_lxu_cache: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['uvm_5fcache_5fmiss_5femulate_5ftest_2ecpp_8',['uvm_cache_miss_emulate_test.cpp',['../uvm__cache__miss__emulate__test_8cpp.html',1,'']]], + ['uvm_5fcache_5fstats_5findex_9',['uvm_cache_stats_index',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aa',1,'fbgemm_gpu']]], + ['uvm_5fcuda_5fmem_5fadvise_10',['uvm_cuda_mem_advise',['../group__cumem-utils.html#gae8c724e90d31245756fc4b0d975f9370',1,'fbgemm_gpu']]], + ['uvm_5fcuda_5fmem_5fprefetch_5fasync_11',['uvm_cuda_mem_prefetch_async',['../group__cumem-utils.html#gaf060db44e71e3419df6e596614ef2081',1,'fbgemm_gpu']]], + ['uvm_5fmem_5fadvice_5fdont_5ffork_12',['uvm_mem_advice_dont_fork',['../group__cumem-utils.html#ga01301ad686f7570c21e81c122d2c7af8',1,'fbgemm_gpu']]], + ['uvm_5fstorage_13',['uvm_storage',['../group__cumem-utils.html#ga05bf2c435c434904ca454c6992861cb6',1,'fbgemm_gpu']]], + ['uvm_5fto_5fcpu_14',['uvm_to_cpu',['../group__cumem-utils.html#gab5a3dab831988b1ce368ccc545b75b48',1,'fbgemm_gpu']]], + 
['uvm_5fto_5fcpu_5fclone_15',['uvm_to_cpu_clone',['../group__cumem-utils.html#ga161495e682d9eac3701dca87469930db',1,'fbgemm_gpu']]], + ['uvm_5fto_5fdevice_16',['uvm_to_device',['../group__cumem-utils.html#gaebfedcf8e6017a6d4f6fb16b52c4c04e',1,'fbgemm_gpu']]], + ['uvm_5fweights_17',['uvm_weights',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a17f61eb7bf7a7e4089982fbf69116da5',1,'uvm_weights: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a45c4a9176e8f636d292288647fdeff77',1,'uvm_weights: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a77d267b92511473228e629909dcb8a07',1,'uvm_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a653cbc621a5959ad8f3951a92154c616',1,'uvm_weights: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a77d267b92511473228e629909dcb8a07',1,'uvm_weights: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a77d267b92511473228e629909dcb8a07',1,'uvm_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a77d267b92511473228e629909dcb8a07',1,'uvm_weights: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a02abd4b4f2f2745d8c6e8c696d70c025',1,'uvm_weights: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]] +]; diff --git a/search/all_16.js b/search/all_16.js new file mode 100644 index 000000000..4299d251d --- /dev/null +++ b/search/all_16.js @@ -0,0 +1,36 @@ +var searchData= +[ + 
['val_0',['val',['../bench__utils_8cuh.html#a0f88d66987f307f00e5868889c52df87',1,'val: bench_utils.cuh'],['../jagged__tensor__ops_2common_8cuh.html#a34f893695235597b772faca329b14963',1,'val: common.cuh']]], + ['vals_1',['vals',['../struct_stack_array.html#a9f80f8c0a4403726aa06af2340127ce3',1,'StackArray']]], + ['value_2',['value',['../structlog2__calc__.html#a06fc87d81c62e9abb8790b6e5713c55ba97de9ab6885342a574053b8f64a563a9',1,'log2_calc_::value'],['../structlog2__calc___3_010_01_4.html#adf764cbdea00d65edcd07bb9953ad2b7a97de9ab6885342a574053b8f64a563a9',1,'log2_calc_< 0 >::value'],['../structlog2__calc.html#a99fb83031ce9923c84392b4e92f956b5a97de9ab6885342a574053b8f64a563a9',1,'log2_calc::value']]], + ['values_5fdata_3',['values_data',['../namespacefbgemm__gpu.html#af7acf47e01ed08917ef22330aaa8f95d',1,'fbgemm_gpu']]], + ['vec4_5facc_4',['vec4_acc',['../namespacefbgemm__gpu.html#ab2a027e4907e39797b913faa6b4e7270',1,'fbgemm_gpu']]], + ['vec4_5fmax_5',['vec4_max',['../namespacefbgemm__gpu.html#a635410cfe229b71efb90199b72107f86',1,'fbgemm_gpu']]], + ['vec4_5fmin_6',['vec4_min',['../namespacefbgemm__gpu.html#ae8a02a5464fb9156400157b45a947c58',1,'fbgemm_gpu']]], + ['vec4_5ftype_7',['vec4_type',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a58da2e6e124bd5725ddbf144b36921f5',1,'vec4_type: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a58da2e6e124bd5725ddbf144b36921f5',1,'vec4_type: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a58da2e6e124bd5725ddbf144b36921f5',1,'vec4_type: embedding_forward_split_kernel_v2_template.cu']]], + ['vec4acct_8',['Vec4AccT',['../structfbgemm__gpu_1_1_vec4_acc_t.html',1,'Vec4AccT'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a7d2508ce413d52826f32884f52ad2f90',1,'fbgemm_gpu::Vec4AccT::Vec4AccT()']]], + 
['vec4stept_9',['Vec4StepT',['../structfbgemm__gpu_1_1_vec4_step_t.html',1,'Vec4StepT< STEP, input_t >'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a6d2826b97c8d5f17a31ed7e7854615ad',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::Vec4StepT()']]], + ['vec4stept_3c_20step_2c_20at_3a_3ahalf_20_3e_10',['Vec4StepT< STEP, at::Half >',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html',1,'fbgemm_gpu']]], + ['vec4stept_3c_20step_2c_20float_20_3e_11',['Vec4StepT< STEP, float >',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html',1,'fbgemm_gpu']]], + ['vec4stept_3c_20step_2c_20uint8_5ft_20_3e_12',['Vec4StepT< STEP, uint8_t >',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_13',['Vec4T',['../structfbgemm__gpu_1_1_vec4_t.html',1,'Vec4T< T >'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< float >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const at::Half 
*p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< double >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const 
double *p)']]], + ['vec4t_3c_20at_3a_3abfloat16_20_3e_14',['Vec4T< at::BFloat16 >',['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20at_3a_3ahalf_20_3e_15',['Vec4T< at::Half >',['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20double_20_3e_16',['Vec4T< double >',['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20float_20_3e_17',['Vec4T< float >',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html',1,'fbgemm_gpu']]], + ['vec4type_18',['Vec4Type',['../struct_vec4_type.html',1,'']]], + ['vec4type_3c_20at_3a_3ahalf_20_3e_19',['Vec4Type< at::Half >',['../struct_vec4_type_3_01at_1_1_half_01_4.html',1,'']]], + ['vec4type_3c_20float_20_3e_20',['Vec4Type< float >',['../struct_vec4_type_3_01float_01_4.html',1,'']]], + ['vec4type_3c_20uint8_5ft_20_3e_21',['Vec4Type< uint8_t >',['../struct_vec4_type_3_01uint8__t_01_4.html',1,'']]], + ['vec_5fcopy_5fwith_5fimplicit_5ftype_5fcast_22',['vec_copy_with_implicit_type_cast',['../namespacefbgemm__gpu.html#a8c639f9912105390e4083332e01ecc57',1,'fbgemm_gpu']]], + ['vec_5fcopy_5fwith_5fimplicit_5ftype_5fcast_3c_20int64_5ft_2c_20int32_5ft_2c_20vec_5fwidth_20_3e_23',['vec_copy_with_implicit_type_cast< int64_t, int32_t, VEC_WIDTH >',['../namespacefbgemm__gpu.html#a6b717a692f34f1bc7afb9eec6d5f9a2e',1,'fbgemm_gpu']]], + ['vec_5fwidth_24',['VEC_WIDTH',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af57bf37dbd6a53004f468edeb5020860',1,'VEC_WIDTH: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#af57bf37dbd6a53004f468edeb5020860',1,'VEC_WIDTH: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#af57bf37dbd6a53004f468edeb5020860',1,'VEC_WIDTH: 
embedding_forward_split_kernel_v2_template.cu'],['../namespacefbgemm__gpu.html#a14fea42ceabd6ac042ad0d2fe5452762',1,'fbgemm_gpu::VEC_WIDTH(combined_indices, indices_addrs[list_id], src_idx, indices_start+src_idx, indices_end - indices_start)'],['../namespacefbgemm__gpu.html#a5aef253d76748f681c0e5d7e1620c8c9',1,'fbgemm_gpu::VEC_WIDTH(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start+src_idx, lengths_end - lengths_start)']]], + ['vecnt_25',['VecNT',['../structfbgemm__gpu_1_1_vec_n_t.html',1,'VecNT< N, PrimitiveType >'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ac774386ebb8ac7021a221b0d32041e40',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::VecNT(float a)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a762e9c277918a40b3e1577984507b77d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::VecNT(half2 a)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ae4b5f2ee834300f0c91a1e1f247b56a5',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::VecNT(uint32_t v, const int exp_bits, const int exp_bias)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::VecNT(uint32_t v, half2 
shift_scale)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::VecNT(uint32_t v, half2 shift_scale)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::VecNT(uint32_t v, half2 shift_scale)']]], + ['vecnt_3c_201_2c_20primitivetype_3a_3afp_20_3e_26',['VecNT< 1, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_2016_2c_20primitivetype_3a_3aint_20_3e_27',['VecNT< 16, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_202_2c_20primitivetype_3a_3afp_20_3e_28',['VecNT< 2, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_204_2c_20primitivetype_3a_3afp_20_3e_29',['VecNT< 4, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_204_2c_20primitivetype_3a_3aint_20_3e_30',['VecNT< 4, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_208_2c_20primitivetype_3a_3aint_20_3e_31',['VecNT< 8, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]], + 
['verify_5ffp16_5fstochastic_5fbenchmark_2ecu_32',['verify_fp16_stochastic_benchmark.cu',['../verify__fp16__stochastic__benchmark_8cu.html',1,'']]] +]; diff --git a/search/all_17.js b/search/all_17.js new file mode 100644 index 000000000..87954c22c --- /dev/null +++ b/search/all_17.js @@ -0,0 +1,25 @@ +var searchData= +[ + ['warning_0',['WARNING',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaa059e9861e0400dfbe05c98a841f3f96b',1,'fbgemm_gpu']]], + ['warp_5fcopy_5fto_1',['warp_copy_to',['../structfbgemm__gpu_1_1_weight_row.html#a4a0da3213c0d4a99586cbe6e6ec72107',1,'fbgemm_gpu::WeightRow']]], + ['warp_5fevict_2',['warp_evict',['../structfbgemm__gpu_1_1_weight_row.html#ae00ddf1640cea584b79618dfd69d91d2',1,'fbgemm_gpu::WeightRow']]], + ['warp_5ffind_5fqparams_3',['warp_find_qparams',['../namespacefbgemm__gpu.html#a78a26de691da2f45a0e4ddaeda75009d',1,'fbgemm_gpu']]], + ['warp_5fid_4',['warp_id',['../namespacefbgemm__gpu.html#a039dca4bc32e9ad20122b5855542e292',1,'fbgemm_gpu']]], + ['warp_5foffsets_5fgroup_5',['warp_offsets_group',['../namespacefbgemm__gpu.html#aecfb31f7c9583dd16ed7463ad8328db4',1,'fbgemm_gpu']]], + ['warp_5freduce_5fmax_6',['warp_reduce_max',['../namespacefbgemm__gpu.html#acddba9c219634f979df1c8b943ac5e88',1,'fbgemm_gpu']]], + ['warp_5freduce_5fmin_7',['warp_reduce_min',['../namespacefbgemm__gpu.html#af554571b877e978f495835af1920f4fb',1,'fbgemm_gpu']]], + ['warpbitonicmergele16_8',['warpBitonicMergeLE16',['../namespacefbgemm__gpu.html#a9bd92b10074adc4fc58e4671a1d1d576',1,'fbgemm_gpu']]], + ['warpreduceallsum_9',['warpReduceAllSum',['../namespacefbgemm__gpu.html#ad47dc8c3cfd941ea7a92b1cb677abf8e',1,'fbgemm_gpu']]], + ['weight_10',['weight',['../namespacefbgemm__gpu.html#ab1426ad1956909abff1b26d04575767a',1,'fbgemm_gpu']]], + ['weight_5fdecay_11',['weight_decay',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a55c90ffc934511c5239912ee28729c08',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + 
['weight_5fdecay_5fmode_12',['weight_decay_mode',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a29382d4f16f27e176ace0d7a6c14bedd',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['weight_5foffset_13',['WEIGHT_OFFSET',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: embedding_forward_split_kernel_v2_template.cu']]], + ['weighted_5fsum_14',['weighted_sum',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, float >::weighted_sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::weighted_sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::weighted_sum()']]], + ['weightrow_15',['WeightRow',['../structfbgemm__gpu_1_1_weight_row.html',1,'WeightRow< emb_t, cache_t, dst_t >'],['../structfbgemm__gpu_1_1_weight_row.html#acb13973152d6d76389dafdf6e69e6793',1,'fbgemm_gpu::WeightRow::WeightRow()']]], + ['weights_16',['weights',['../structinternal_1_1_hyper_compressed_sparse_column.html#a210dc23584593727ddf26671264aa16a',1,'internal::HyperCompressedSparseColumn::weights'],['../namespacefbgemm__gpu.html#adc8829ea4c8f30f6aaef00680ba3754a',1,'fbgemm_gpu::weights']]], + ['weights_5fdata_17',['weights_data',['../namespacefbgemm__gpu.html#a1148e12a9142c43e97064ffe24a0aa63',1,'fbgemm_gpu']]], + 
['weights_5foffsets_18',['weights_offsets',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a764f8ae801cd000c2a5cb4bb23f14299',1,'weights_offsets: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a8952c1fa3b8169bec4e9aa6f07ce2271',1,'weights_offsets: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a9f15527d585dd62a23511c2f0bad4ca7',1,'weights_offsets: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a7d1fc13d818566d961fdf0fd44612dbb',1,'weights_offsets: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9af84081fe94d1658365400ffcb263bc',1,'weights_offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a9f15527d585dd62a23511c2f0bad4ca7',1,'weights_offsets: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#aba904c170660e349edfb178490ec1ec1',1,'weights_offsets: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aba904c170660e349edfb178490ec1ec1',1,'weights_offsets: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#aba904c170660e349edfb178490ec1ec1',1,'weights_offsets: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a68c2c016f330babab668514e78cb3bf1',1,'weights_offsets: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + 
['weights_5fplacements_19',['weights_placements',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#ad4dd9cc51f1eccdf4626318632701868',1,'weights_placements: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a9a841e0386a10dcd6aa2fce96a7880b8',1,'weights_placements: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5aa4ec0a3620e915289c174bc6ce3108',1,'weights_placements: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#af6ffac73b54018941c14b57180e69abd',1,'weights_placements: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a5aa4ec0a3620e915289c174bc6ce3108',1,'weights_placements: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5aa4ec0a3620e915289c174bc6ce3108',1,'weights_placements: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a5aa4ec0a3620e915289c174bc6ce3108',1,'weights_placements: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a9987071f2ac942c5d6c47d628b971738',1,'weights_placements: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['while_20',['while',['../namespacefbgemm__gpu.html#a44128eca539acfe55bdf792616e8b5b6',1,'fbgemm_gpu']]], + ['write_5floop_5fsmall_5fls_21',['write_loop_small_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > 
*const accumulator, const uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > *const accumulator, const uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > *const accumulator, const uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): embedding_forward_split_kernel_v2_template.cu']]] +]; diff --git a/search/all_18.js b/search/all_18.js new file mode 100644 index 000000000..e2d8da4f7 --- /dev/null +++ b/search/all_18.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['x_0',['X',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: 
gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: 
gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: 
embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu']]], + ['xor128_1',['xor128',['../structfbgemm__gpu_1_1rk__state.html#a257f1349dcd98722e373947808b773c6',1,'fbgemm_gpu::rk_state']]] +]; diff --git a/search/all_19.js b/search/all_19.js new file mode 100644 index 000000000..ae5adc93e --- /dev/null +++ b/search/all_19.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['y_0',['Y',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: embedding_forward_quantized_split_nbit_host_template.cu']]] +]; diff --git a/search/all_1a.js b/search/all_1a.js new file mode 100644 index 000000000..6e78def8d --- /dev/null +++ b/search/all_1a.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['zipf_5fcuda_0',['zipf_cuda',['../namespacefbgemm__gpu.html#a957e5dced6114b32a6d2e5e62011adbf',1,'fbgemm_gpu']]], + ['zipf_5fkernel_1',['zipf_kernel',['../namespacefbgemm__gpu.html#a6991817ca1213e7cc0eba3bad689c03a',1,'fbgemm_gpu']]] +]; diff --git a/search/all_1b.js 
b/search/all_1b.js new file mode 100644 index 000000000..28e447783 --- /dev/null +++ b/search/all_1b.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['_7ehypercompressedsparsecolumn_0',['~HyperCompressedSparseColumn',['../structinternal_1_1_hyper_compressed_sparse_column.html#a60d5f8ac0716350bb51bcf02ed10aaeb',1,'internal::HyperCompressedSparseColumn']]], + ['_7einitializer_1',['~Initializer',['../classssd_1_1_initializer.html#a7a69aed99981539d9a2c0ee85459b4b6',1,'ssd::Initializer']]] +]; diff --git a/search/all_2.js b/search/all_2.js index 12377d589..a864338e2 100644 --- a/search/all_2.js +++ b/search/all_2.js @@ -1,7 +1,52 @@ var searchData= [ - ['data_20cpu_20operators_0',['data cpu operators',['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], - ['data_20cuda_20operators_1',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]], - ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcuda_2',['direct_mapped_lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#gae019b6879bd9f89a146e0700d5a4bd8b',1,'split_embeddings_cache_cuda.cuh']]], - ['direct_5fmapped_5flxu_5fcache_5flookup_5fcuda_3',['direct_mapped_lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#gab305ebdd3822794c5ac462bf5df4bb49',1,'split_embeddings_cache_cuda.cuh']]] + ['b_0',['b',['../structfbgemm__gpu_1_1_half4.html#a85c654c77d6c3fc7709e8dd1e7ec4a5e',1,'fbgemm_gpu::Half4']]], + ['b_1',['B',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a240b4e029c521f922d447346c8b757b8',1,'B: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad0a8e9e782f3a3f177d6791f9ee9b866',1,'B: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a240b4e029c521f922d447346c8b757b8',1,'B: 
gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#ad0a8e9e782f3a3f177d6791f9ee9b866',1,'B: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad0a8e9e782f3a3f177d6791f9ee9b866',1,'B: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#ad0a8e9e782f3a3f177d6791f9ee9b866',1,'B: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../namespacefbgemm__gpu.html#adb51b4975da6fe6cd1f6465b56b3b8ab',1,'fbgemm_gpu::B']]], + ['b_2',['b',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a83bdb6901e840002ec04521cd2fcafe6',1,'b: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a83bdb6901e840002ec04521cd2fcafe6',1,'b: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#ab540864a8f4d5cfb95d168df6ff1ac51',1,'fbgemm_gpu::b']]], + ['backward_3',['backward',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html#ac7ddba5222bfda33f8a498f8394349bf',1,'fbgemm_gpu::PermutePooledEmbsFunction::backward()'],['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html#ad62a42e85be3aa7f972677a4f7b710f9',1,'fbgemm_gpu::PermutePooledEmbsFunctionSplit::backward()']]], + ['ballot_5fsync_4',['ballot_sync',['../namespacefbgemm__gpu.html#ac9ef3cbe68285c5559d30c5157131e29',1,'fbgemm_gpu']]], + ['batch_5fauc_5',['batch_auc',['../namespacefbgemm__gpu.html#abeeb6bd4d39a0e534db2213258704285',1,'fbgemm_gpu']]], + ['batch_5findex_5fselect_5fdim0_5fcodegen_5fbackward_5fcuda_6',['batch_index_select_dim0_codegen_backward_cuda',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html#a5709eebbefa399282269508003e47e25',1,'batch_index_select_dim0_codegen_backward_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor 
&weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const int64_t max_segment_length_per_warp, const Tensor &grad_offsets, const Tensor &total_L_offsets, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_codegen_cuda.cu'],['../batch__index__select__dim0__host_8cpp.html#a5709eebbefa399282269508003e47e25',1,'batch_index_select_dim0_codegen_backward_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const int64_t max_segment_length_per_warp, const Tensor &grad_offsets, const Tensor &total_L_offsets, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_codegen_cuda.cu']]], + ['batch_5findex_5fselect_5fdim0_5fcodegen_5fforward_5fcuda_7',['batch_index_select_dim0_codegen_forward_cuda',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a5951ed801e11a01c29c7bbfb648ee230',1,'batch_index_select_dim0_codegen_forward_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const int64_t output_dtype, const Tensor &output_offsets, const Tensor &total_L_offsets, const int64_t output_size, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../batch__index__select__dim0__host_8cpp.html#a5951ed801e11a01c29c7bbfb648ee230',1,'batch_index_select_dim0_codegen_forward_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const int64_t output_dtype, const 
Tensor &output_offsets, const Tensor &total_L_offsets, const int64_t output_size, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_forward_codegen_cuda.cu']]], + ['batch_5findex_5fselect_5fdim0_5fcpu_8',['batch_index_select_dim0_cpu',['../batch__index__select__dim0__cpu__host_8cpp.html#aa719f2231fb791074324f6bbeace9d0c',1,'batch_index_select_dim0_cpu_host.cpp']]], + ['batch_5findex_5fselect_5fdim0_5fcpu_5fhost_2ecpp_9',['batch_index_select_dim0_cpu_host.cpp',['../batch__index__select__dim0__cpu__host_8cpp.html',1,'']]], + ['batch_5findex_5fselect_5fdim0_5fgpu_10',['batch_index_select_dim0_gpu',['../batch__index__select__dim0__host_8cpp.html#a5bad7a4ddb5cf6144ad19b6296ef585c',1,'batch_index_select_dim0_host.cpp']]], + ['batch_5findex_5fselect_5fdim0_5fhost_2ecpp_11',['batch_index_select_dim0_host.cpp',['../batch__index__select__dim0__host_8cpp.html',1,'']]], + ['batch_5fsize_12',['batch_size',['../namespacefbgemm__gpu.html#add6df347839b36aa580f997fddaebf86',1,'fbgemm_gpu']]], + ['batch_5fsize_5foffsets_13',['batch_size_offsets',['../namespacefbgemm__gpu.html#afba1f0bf46d421e1e2834949792290e0',1,'fbgemm_gpu']]], + ['batch_5fsize_5fper_5ffeature_14',['batch_size_per_feature',['../namespacefbgemm__gpu.html#a34cfcac7aff478aac7e03c48a25b0447',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_15',['batched_dense_vec_jagged_2d_mul',['../group__jagged-tensor-ops-cpu.html#ga67afdd148d57be07278c9cb088b5ff4b',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_16',['batched_dense_vec_jagged_2d_mul_backward',['../namespacefbgemm__gpu.html#ae815e5156f29e106f0fcb6054d386afa',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_2ecu_17',['batched_dense_vec_jagged_2d_mul_backward.cu',['../batched__dense__vec__jagged__2d__mul__backward_8cu.html',1,'']]], + 
['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_5fmeta_18',['batched_dense_vec_jagged_2d_mul_backward_meta',['../namespacefbgemm__gpu.html#af5324c97be6dc5aecbc40e4e3244646f',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_19',['batched_dense_vec_jagged_2d_mul_forward',['../namespacefbgemm__gpu.html#ac3080e0008d5cdd9f1f32b33e38aee95',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_2ecu_20',['batched_dense_vec_jagged_2d_mul_forward.cu',['../batched__dense__vec__jagged__2d__mul__forward_8cu.html',1,'']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_5fmeta_21',['batched_dense_vec_jagged_2d_mul_forward_meta',['../namespacefbgemm__gpu.html#a399af8be70030a7aeaedbdf546efe61a',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fbackward_5fcuda_22',['batched_unary_embeddings_backward_cuda',['../namespacefbgemm__gpu.html#a0e4965515624f44fcd114ff1e5ff0998',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fforward_5fcpu_23',['batched_unary_embeddings_forward_cpu',['../namespacefbgemm__gpu.html#a96db75aa5b2617976c2937ab051b737e',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fforward_5fcuda_24',['batched_unary_embeddings_forward_cuda',['../namespacefbgemm__gpu.html#a9895cf76445e7258f2464bb037d2c54c',1,'fbgemm_gpu']]], + ['bench_5futils_2ecuh_25',['bench_utils.cuh',['../bench__utils_8cuh.html',1,'']]], + ['benchmark_5ffunction_26',['benchmark_function',['../bench__utils_8cuh.html#a8b8729bf92a232e1ff3403ebe7089fdd',1,'bench_utils.cuh']]], + ['bf_27',['BF',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a7b8d2f92148f52cad46e331936922e80',1,'fbgemm_gpu']]], + ['bf16_28',['BF16',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaf656bbf613964dcf710b771b0918ab30',1,'fbgemm_gpu']]], + ['bfloat16quantizedtofloat_5fref_29',['BFloat16QuantizedToFloat_ref',['../namespacefbgemm__gpu.html#a0f1d1afe56f116552e1ca9759e6e0fcc',1,'fbgemm_gpu']]], + 
['bin_5fboundaries_30',['bin_boundaries',['../namespacefbgemm__gpu.html#a7d3b870a22caa3968ca55fb89420e970',1,'fbgemm_gpu']]], + ['bin_5fctr_5fin_5fuse_5fafter_31',['bin_ctr_in_use_after',['../namespacefbgemm__gpu.html#a5306cfe92409d5d6525baade1714a78a',1,'fbgemm_gpu']]], + ['bin_5fctr_5fweight_5fvalue_32',['bin_ctr_weight_value',['../namespacefbgemm__gpu.html#a505eb55e26cb1a63decb22880c93b9fd',1,'fbgemm_gpu']]], + ['bin_5fids_5fdata_33',['bin_ids_data',['../namespacefbgemm__gpu.html#a24c7d1d72baa0efece963a4ed4db9c17',1,'fbgemm_gpu']]], + ['bin_5fnum_5fexamples_5fdata_34',['bin_num_examples_data',['../namespacefbgemm__gpu.html#ad09ae93c92bfe0fe061460cfe4acd611',1,'fbgemm_gpu']]], + ['bin_5fnum_5fpositives_5fdata_35',['bin_num_positives_data',['../namespacefbgemm__gpu.html#a6cf3109a8de0f8ef7a818474a2fec845',1,'fbgemm_gpu']]], + ['binary_5fsearch_5frange_36',['binary_search_range',['../namespacefbgemm__gpu.html#a13b4df4139f3c64ac4d8dbea51a7e7a0',1,'fbgemm_gpu']]], + ['binary_5fsearch_5frange_5fcpu_37',['binary_search_range_cpu',['../sparse__ops__utils_8h.html#a519154f3b89148b1b70e45d8c340ff81',1,'sparse_ops_utils.h']]], + ['bitonicsort_38',['BitonicSort',['../structfbgemm__gpu_1_1_bitonic_sort.html',1,'fbgemm_gpu']]], + ['block_5fbucketize_5fpos_5fconcat_39',['block_bucketize_pos_concat',['../namespacefbgemm__gpu.html#acc943f4a5b9448babdf4b36ff9095dff',1,'fbgemm_gpu']]], + ['block_5fbucketize_5fpos_5foffsets_40',['block_bucketize_pos_offsets',['../namespacefbgemm__gpu.html#a7caa87d119b6ee26ae8fe2b66671215c',1,'fbgemm_gpu']]], + ['block_5fbucketize_5fsparse_5ffeatures_5fcpu_41',['block_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a270e4d8df103fa6c3e6750890608b566',1,'fbgemm_gpu']]], + ['block_5fbucketize_5fsparse_5ffeatures_5fcuda_42',['block_bucketize_sparse_features_cuda',['../namespacefbgemm__gpu.html#a293dc249ac4679d97747778a7fb02bd5',1,'fbgemm_gpu']]], + 
['block_5fsizes_5fdata_43',['block_sizes_data',['../namespacefbgemm__gpu.html#ab2cdb48bca4ebe95f2cdeedea09f549f',1,'fbgemm_gpu']]], + ['bounds_5fcheck_5f_44',['bounds_check_',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a0e958eecb22f175be483bef10d6e2597',1,'fbgemm_gpu::GenericPackedTensorAccessorBase']]], + ['bounds_5fcheck_5findices_5fcuda_45',['bounds_check_indices_cuda',['../group__embedding-cuda.html#gad1c20ea2ace30c269811890919ebdb6e',1,'bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode_, Tensor &warning, const c10::optional< Tensor > &weights, const c10::optional< Tensor > &B_offsets, const int64_t max_B): embedding_bounds_check.cu'],['../group__embedding-cuda.html#gad1c20ea2ace30c269811890919ebdb6e',1,'bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode, Tensor &warning, const c10::optional< Tensor > &weights, const c10::optional< Tensor > &B_ofsets, const int64_t max_B): embedding_bounds_check.cu']]], + ['boundscheckmode_46',['BoundsCheckMode',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111ea',1,'fbgemm_gpu']]], + ['bucketize_5fsparse_5ffeatures_5fcpu_47',['bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a83c70249ce058969210bda8aedf671a4',1,'fbgemm_gpu']]], + ['bucketize_5fsparse_5ffeatures_5fcuda_48',['bucketize_sparse_features_cuda',['../namespacefbgemm__gpu.html#abb94f2bd00f8ee054a4a1d2417a093d1',1,'fbgemm_gpu']]] ]; diff --git a/search/all_3.js b/search/all_3.js index e39df76fa..4f7f59675 100644 --- a/search/all_3.js +++ b/search/all_3.js @@ -1,6 +1,65 @@ var searchData= [ - ['embedding_20cpu_20operators_0',['Embedding CPU Operators',['../group__embedding-cpu.html',1,'']]], - ['embedding_20cuda_20operators_1',['Embedding CUDA Operators',['../group__embedding-cuda.html',1,'']]], - 
['expand_5finto_5fjagged_5fpermute_5fcuda_2',['expand_into_jagged_permute_cuda',['../group__sparse-data-cuda.html#ga2402de1c0102b21af5f2bd5a50d30309',1,'fbgemm_gpu']]] + ['c_5fversion_0',['C_VERSION',['../_c_make_c_compiler_id_8c.html#adaee3ee7c5a7a22451ea25e762e1d7d5',1,'CMakeCCompilerId.c']]], + ['cache_5frow_5f_1',['cache_row_',['../structfbgemm__gpu_1_1_weight_row.html#a8ba350d1da8749a0975ab4c1f645de70',1,'fbgemm_gpu::WeightRow']]], + ['cache_5fvec_5ft_2',['cache_vec_t',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a6394626e129b23b47a8e900179ea1a98',1,'cache_vec_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6394626e129b23b47a8e900179ea1a98',1,'cache_vec_t: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['calc_5foffsets_5frange_5fthread_5fblock_3',['calc_offsets_range_thread_block',['../namespacefbgemm__gpu.html#ae0656dd690bcffdd8b470d894e25b2d8',1,'fbgemm_gpu']]], + ['calibrated_5fprediction_5fdata_4',['calibrated_prediction_data',['../namespacefbgemm__gpu.html#a5a04eca282d6278fd065294a91065404',1,'fbgemm_gpu']]], + ['cat_5freorder_5fbatched_5fad_5findices_5fcpu_5',['cat_reorder_batched_ad_indices_cpu',['../namespacefbgemm__gpu.html#a1ed236113fa360c41a2eb0507c3fc2c7',1,'fbgemm_gpu']]], + ['cat_5freorder_5fbatched_5fad_5findices_5fcpu_5f_6',['cat_reorder_batched_ad_indices_cpu_',['../namespacefbgemm__gpu.html#a6b5e65a3f532db97f093037c9dcb3902',1,'fbgemm_gpu']]], + ['cmakeccompilerid_2ec_7',['CMakeCCompilerId.c',['../_c_make_c_compiler_id_8c.html',1,'']]], + ['cmakecxxcompilerid_2ecpp_8',['CMakeCXXCompilerId.cpp',['../_c_make_c_x_x_compiler_id_8cpp.html',1,'']]], + ['column_5fsegment_5fids_9',['column_segment_ids',['../structinternal_1_1_hyper_compressed_sparse_column.html#a1e60e73bdb48b0daa00b9f6caa8c6728',1,'internal::HyperCompressedSparseColumn']]], + 
['column_5fsegment_5findices_10',['column_segment_indices',['../structinternal_1_1_hyper_compressed_sparse_column.html#ad90d05e46d82122e7688be758b7cb43a',1,'internal::HyperCompressedSparseColumn']]], + ['column_5fsegment_5fptr_11',['column_segment_ptr',['../structinternal_1_1_hyper_compressed_sparse_column.html#ad1d5cb09cff5c55cbb74931bc58d8080',1,'internal::HyperCompressedSparseColumn']]], + ['combine_20input_20operators_12',['Combine Input Operators',['../group__input-combine.html',1,'']]], + ['combined_5flengths_13',['combined_lengths',['../namespacefbgemm__gpu.html#a176c2b8769558803ba0614bc04b7995f',1,'fbgemm_gpu']]], + ['combined_5fweights_14',['combined_weights',['../namespacefbgemm__gpu.html#a426e281c9c2dd29c0abe399f17ba8d6f',1,'fbgemm_gpu']]], + ['common_2ecuh_15',['common.cuh',['../jagged__tensor__ops_2common_8cuh.html',1,'(Global Namespace)'],['../memory__utils_2common_8cuh.html',1,'(Global Namespace)'],['../quantize__ops_2common_8cuh.html',1,'(Global Namespace)'],['../sparse__ops_2common_8cuh.html',1,'(Global Namespace)'],['../split__embeddings__cache_2common_8cuh.html',1,'(Global Namespace)']]], + ['common_2eh_16',['common.h',['../memory__utils_2common_8h.html',1,'(Global Namespace)'],['../split__embeddings__cache_2common_8h.html',1,'(Global Namespace)']]], + ['compact_17',['compact',['../classssd_1_1_embedding_rocks_d_b.html#a043cdfc194924194e381a986c229569e',1,'ssd::EmbeddingRocksDB']]], + ['compact_5fif_5fnecessary_18',['compact_if_necessary',['../classssd_1_1_embedding_rocks_d_b.html#a92b07dcd61720ad3a72dbbad89c26514',1,'ssd::EmbeddingRocksDB']]], + ['comparator_19',['Comparator',['../structfbgemm__gpu_1_1_comparator.html',1,'fbgemm_gpu']]], + ['compiler_5fid_20',['COMPILER_ID',['../_c_make_c_compiler_id_8c.html#a81dee0709ded976b2e0319239f72d174',1,'COMPILER_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a81dee0709ded976b2e0319239f72d174',1,'COMPILER_ID: CMakeCXXCompilerId.cpp']]], + 
['compute_5ffrequency_5fsequence_21',['compute_frequency_sequence',['../namespacefbgemm__gpu.html#a6b41d7b032eb1abe61eee0bd903d8dfb',1,'fbgemm_gpu']]], + ['compute_5fnum_5fuint64s_22',['compute_num_uint64s',['../namespacefbgemm__gpu.html#af861e4a8f7b669619744fe59ca2f73a3',1,'fbgemm_gpu']]], + ['consumer_5fqueue_5f_23',['consumer_queue_',['../classssd_1_1_initializer.html#a794bafa095540403ada855b817d1d367',1,'ssd::Initializer']]], + ['convert_5ffloat_5fto_5fhalf_5fassemblefloat_24',['convert_float_to_half_assemblefloat',['../verify__fp16__stochastic__benchmark_8cu.html#abbb1b78a4249b42b116429258ac56174',1,'verify_fp16_stochastic_benchmark.cu']]], + ['convert_5ffloat_5fto_5fhalf_5fbitcarry_25',['convert_float_to_half_bitcarry',['../verify__fp16__stochastic__benchmark_8cu.html#a46898a808f7408d99e7ad4c7fc0fea2a',1,'verify_fp16_stochastic_benchmark.cu']]], + ['convert_5ffloat_5fto_5fhalf_5fdirect_26',['convert_float_to_half_direct',['../verify__fp16__stochastic__benchmark_8cu.html#a169a7087c41e8efae2d09cfc78fa802e',1,'verify_fp16_stochastic_benchmark.cu']]], + ['convert_5ffloat_5fto_5fhalf_5fshortrand_27',['convert_float_to_half_shortrand',['../verify__fp16__stochastic__benchmark_8cu.html#ab109332ca0fae3f39a7d000348a1401c',1,'verify_fp16_stochastic_benchmark.cu']]], + ['copy_28',['copy',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#aa3322732b0a44cf924b89a066f4503d4',1,'fbgemm_gpu::Vec4T< float >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ad6a7665bbc9596b7b9123c9a0605fe1c',1,'fbgemm_gpu::Vec4T< at::Half >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a140a9bcb80dcfae69a427d885d148952',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a8af22674533453883301576ae485699c',1,'fbgemm_gpu::Vec4T< double >::copy()']]], + 
['copy_5fstr_29',['copy_str',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a7c56e8e49eb26679b9cf3a65c3bd38a9',1,'fbgemm_gpu::GenericPackedTensorAccessorBase']]], + ['cp_5fasync_5ffence_30',['cp_async_fence',['../namespacenbit.html#a9d3f5c31c0728bd8031522979f9fd236',1,'nbit']]], + ['cp_5fasync_5fwait_31',['cp_async_wait',['../namespacenbit.html#ab71806d51c0bb2fbc0b08fb3ed2b442e',1,'nbit']]], + ['cp_5fasync_5fwait_3c_200_20_3e_32',['cp_async_wait< 0 >',['../namespacenbit.html#a869b22b83f81fa2ed2302ceb80d9b9ca',1,'nbit']]], + ['cp_5fasync_5fzfill_33',['cp_async_zfill',['../namespacenbit.html#ac46112b67b5de646034bc1d35d44c8fe',1,'nbit']]], + ['cp_5fasync_5fzfill_5fcg_34',['cp_async_zfill_cg',['../namespacenbit.html#a7f38bc64db06ad5e5ee1b4efa55c349d',1,'nbit']]], + ['cpu_35',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['cpu_20operators_36',['CPU Operators',['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], + ['cpu_5fkernel_5ftest_2ecpp_37',['cpu_kernel_test.cpp',['../cpu__kernel__test_8cpp.html',1,'']]], + ['cpu_5futils_2eh_38',['cpu_utils.h',['../cpu__utils_8h.html',1,'']]], + ['csr2csc_39',['csr2csc',['../namespaceinternal.html#adff2ce52cb6a5e84b57614a452aa77d5',1,'internal']]], + ['csr2csc_3c_20double_20_3e_40',['csr2csc< double >',['../namespaceinternal.html#ab8f896e4d2c97b1369a8e5fb7d9408b7',1,'internal']]], + ['csr2csc_3c_20float_20_3e_41',['csr2csc< float >',['../namespaceinternal.html#a3715c6c222855aa1b842c358fe2a6420',1,'internal']]], + ['csr_5fseg_5fdata_42',['csr_seg_data',['../namespacefbgemm__gpu.html#a0523b0079ced4e8a092ec1f3e5b5a193',1,'fbgemm_gpu']]], + 
['cub_5fnamespace_5fpostfix_2ecuh_43',['cub_namespace_postfix.cuh',['../cub__namespace__postfix_8cuh.html',1,'']]], + ['cub_5fnamespace_5fprefix_2ecuh_44',['cub_namespace_prefix.cuh',['../cub__namespace__prefix_8cuh.html',1,'']]], + ['cuda_45',['CUDA',['../group__permute-pooled-embs-gpu.html',1,'Permute Pooled Embeddings Operators (CUDA)'],['../group__quantize-ops-cuda.html',1,'Quantization Operators (CUDA)']]], + ['cuda_20memory_20operators_46',['CUDA Memory Operators',['../group__cumem-utils.html',1,'']]], + ['cuda_20operators_47',['CUDA Operators',['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], + ['cuda_5fcalc_5fblock_5fcount_48',['cuda_calc_block_count',['../sparse__ops__utils_8h.html#ab702f2479ba0bedf91c18e0b644b210a',1,'sparse_ops_utils.h']]], + ['cuda_5fcalc_5fxblock_5fcount_49',['cuda_calc_xblock_count',['../sparse__ops__utils_8h.html#a2eba06f69b5b34fe6ca0eafb0240d369',1,'sparse_ops_utils.h']]], + ['cuda_5fcalc_5fxblock_5fcount_5fbase_50',['cuda_calc_xblock_count_base',['../sparse__ops__utils_8h.html#a885f787cafec301665604303ae43a2e3',1,'sparse_ops_utils.h']]], + ['cuda_5fcheck_51',['CUDA_CHECK',['../cuda__utils_8cuh.html#ad64d49299c3d240ae540a693ae38ca38',1,'cuda_utils.cuh']]], + ['cuda_5fdevice_5f_52',['cuda_device_',['../memory__utils_8cu.html#a96208d96b413317e110ff94d64c71ef4',1,'memory_utils.cu']]], + ['cuda_5fkernel_5floop_53',['CUDA_KERNEL_LOOP',['../namespacefbgemm__gpu.html#a14c0f0b2b6107f2b17eb472d9be9fb03',1,'fbgemm_gpu::CUDA_KERNEL_LOOP(b_t, lengths_size)'],['../namespacefbgemm__gpu.html#ab331d23c5119efeb513b36fed74c53b0',1,'fbgemm_gpu::CUDA_KERNEL_LOOP(r, lengths_size)']]], + 
['cuda_5futils_2ecuh_54',['cuda_utils.cuh',['../cuda__utils_8cuh.html',1,'']]], + ['cumem_5futils_2eh_55',['cumem_utils.h',['../cumem__utils_8h.html',1,'']]], + ['curr_5fbin_5fid_56',['curr_bin_id',['../namespacefbgemm__gpu.html#aa80c593013706e17927a0cedd1d6dbb0',1,'fbgemm_gpu']]], + ['curr_5fbin_5fnum_5fexamples_57',['curr_bin_num_examples',['../namespacefbgemm__gpu.html#afce91df3fd14c65d1d464b891004b1da',1,'fbgemm_gpu']]], + ['curr_5foffset_58',['curr_offset',['../namespacefbgemm__gpu.html#a5774000010ec731b390787b3b5f72868',1,'fbgemm_gpu']]], + ['curr_5fsegment_5fvalue_59',['curr_segment_value',['../namespacefbgemm__gpu.html#a216663a22f5311b9ecf7c9bc64ee047d',1,'fbgemm_gpu']]], + ['cutlass_5fget_5fsmem_5fpointer_60',['cutlass_get_smem_pointer',['../namespacenbit.html#a64cf76bab7c5be6cb2b0c7d1b77443a5',1,'nbit::cutlass_get_smem_pointer(void *ptr)'],['../namespacenbit.html#a250008d643379010295dede0b64068c6',1,'nbit::cutlass_get_smem_pointer(void const *ptr)']]], + ['cxx_5fstd_61',['CXX_STD',['../_c_make_c_x_x_compiler_id_8cpp.html#a34cc889e576a1ae6c84ae9e0a851ba21',1,'CMakeCXXCompilerId.cpp']]] ]; diff --git a/search/all_4.js b/search/all_4.js index 7e14db79a..39a5f8f39 100644 --- a/search/all_4.js +++ b/search/all_4.js @@ -1,4 +1,69 @@ var searchData= [ - ['for_20cuda_0',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]] + ['d_0',['D',['../classfbgemm__gpu_1_1_fixed_divisor.html#aa0904583fc7c962f6ae008052d6dadf7',1,'fbgemm_gpu::FixedDivisor::D()'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a057f5488fcdaf454d09c4f1b25374ac9',1,'D: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a952bac18af6443873547ada58c1e9f82',1,'D: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu']]], + ['d_5fflush2_1',['d_flush2',['../bench__utils_8cuh.html#a33347a1447f1a3618e698f9d5914c253',1,'bench_utils.cuh']]], + 
['d_5foffsets_2',['D_offsets',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a8a3ac708f5fc38ea5ebecdbe685f3c73',1,'D_offsets: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a723eb6856253bb4551265a356dd5f35d',1,'D_offsets: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1e9016830b84a13779c14bb73acce5b1',1,'D_offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a723eb6856253bb4551265a356dd5f35d',1,'D_offsets: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a1cf44edc754c1d53c702015bfb974d77',1,'D_offsets: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1cf44edc754c1d53c702015bfb974d77',1,'D_offsets: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a1cf44edc754c1d53c702015bfb974d77',1,'D_offsets: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['d_5fstart_3',['D_start',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aede9588b11147ebb6a17d9672563737c',1,'D_start: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aede9588b11147ebb6a17d9672563737c',1,'D_start: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['data_4',['data',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a00eb43c6e0e2f9b3a5d083cf44bad46c',1,'fbgemm_gpu::TensorAccessorBase::data()'],['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a445a0aad25aa4b10485392cab109a77b',1,'fbgemm_gpu::TensorAccessorBase::data() 
const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a00eb43c6e0e2f9b3a5d083cf44bad46c',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::data()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a445a0aad25aa4b10485392cab109a77b',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::data() const'],['../jagged__tensor__ops_2common_8cuh.html#a4f36f56fa6a995a4ad013e16ba311b31',1,'data: common.cuh']]], + ['data_20cpu_20operators_5',['Data CPU Operators',['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], + ['data_20cuda_20operators_6',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]], + ['data_5f_7',['data_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a677c54e91f2222170a12252fe509d069',1,'fbgemm_gpu::TensorAccessorBase::data_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a677c54e91f2222170a12252fe509d069',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::data_']]], + ['db_5fshard_8',['db_shard',['../namespacessd.html#ac0918c17a5ef4ae94a7d4068512744f9',1,'ssd']]], + ['dec_9',['DEC',['../_c_make_c_compiler_id_8c.html#ad1280362da42492bbc11aa78cbf776ad',1,'DEC: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#ad1280362da42492bbc11aa78cbf776ad',1,'DEC: CMakeCXXCompilerId.cpp']]], + ['decl_5fradix_5fsort_5fpairs_5ffn_10',['DECL_RADIX_SORT_PAIRS_FN',['../split__embeddings__utils_8cuh.html#a91fe9e10ff5c98fe4952c9c0986476b4',1,'DECL_RADIX_SORT_PAIRS_FN: split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a07c7c57b2dd34f8dcede30593003253c',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, float): split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a665ecb055cdda875801b442d35297e10',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, double): split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a68379ca489210e052be87595ff7c1ec7',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t): 
split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a94564bf3eeebee1b64b0fe3ba0b3b7e0',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t): split_embeddings_utils.cuh']]], + ['def_5fradix_5fsort_5fpairs_5ffn_11',['DEF_RADIX_SORT_PAIRS_FN',['../radix__sort__pairs_8cu.html#a4cf2c787c9111fdc77b98fcc9e690344',1,'DEF_RADIX_SORT_PAIRS_FN: radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#aca8b050260de3f4f24d6bb405cbbdd85',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, float): radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#a8ff9c3ca029c1596694941f07c7b2dc4',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, double): radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#a932f303789b405fceb31dd0f40f10d43',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t): radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#ac3e8e7f0d44c6e7d4a5aea790dca2526',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t): radix_sort_pairs.cu']]], + ['default_5finfo_5fb_5fmask_12',['DEFAULT_INFO_B_MASK',['../split__embeddings__utils_8cuh.html#a312a32dcc1f3a4980ed4c458b8bab67f',1,'split_embeddings_utils.cuh']]], + ['default_5finfo_5fb_5fnum_5fbits_13',['DEFAULT_INFO_B_NUM_BITS',['../split__embeddings__utils_8cuh.html#ac9d136da765bb4871acd477da0f2c254',1,'split_embeddings_utils.cuh']]], + ['default_5finfo_5fnum_5fbits_14',['DEFAULT_INFO_NUM_BITS',['../split__embeddings__utils_8cuh.html#a27002d5a8e75578957e448377c440dbd',1,'split_embeddings_utils.cuh']]], + ['defaultptrtraits_15',['DefaultPtrTraits',['../structfbgemm__gpu_1_1_default_ptr_traits.html',1,'fbgemm_gpu']]], + ['dense_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_16',['dense_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a840483d38dd0ee3fe4b398ebee5bf3d7',1,'dense_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const 
int64_t pooling_mode, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a840483d38dd0ee3fe4b398ebee5bf3d7',1,'dense_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu']]], + ['dense_5fembedding_5fcodegen_5fforward_5funweighted_5fmeta_17',['dense_embedding_codegen_forward_unweighted_meta',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#ac9e6ce9ed24a999160137cd295420a9f',1,'gen_embedding_forward_dense_unweighted_codegen_meta.cpp']]], + ['dense_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_18',['dense_embedding_codegen_forward_weighted_cuda',['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a4e4e521f171d17c5d78bee2b3c9b21db',1,'dense_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a4e4e521f171d17c5d78bee2b3c9b21db',1,'dense_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_weighted_codegen_cuda.cu']]], + 
['dense_5fembedding_5fcodegen_5fforward_5fweighted_5fmeta_19',['dense_embedding_codegen_forward_weighted_meta',['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html#ac89d0c2dc36fc6053f0425a919711b3a',1,'gen_embedding_forward_dense_weighted_codegen_meta.cpp']]], + ['dense_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcuda_20',['dense_embedding_codegen_grad_indice_weights_cuda',['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#aa413d80f0ebbadd4375b29cfb27654b3',1,'dense_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &feature_requires_grad): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aa413d80f0ebbadd4375b29cfb27654b3',1,'dense_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &feature_requires_grad): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu']]], + ['dense_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_21',['dense_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#aadd3974603c08fba6a7c21638a57e7f4',1,'dense_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aadd3974603c08fba6a7c21638a57e7f4',1,'dense_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t 
D, const Tensor &indices, const Tensor &offsets, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu']]], + ['dense_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fmeta_22',['dense_embedding_nobag_codegen_forward_unweighted_meta',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#ac9b06d5bef944e3f22c1b7d5faf0cc73',1,'gen_embedding_forward_dense_unweighted_codegen_meta.cpp']]], + ['dense_5fsegment_5fvalue_5fdata_23',['dense_segment_value_data',['../namespacefbgemm__gpu.html#a2f93c0df9186a239cfd59505a464fc36',1,'fbgemm_gpu']]], + ['dense_5fto_5fjagged_24',['dense_to_jagged',['../group__jagged-tensor-ops-cpu.html#gae25fa8a028fc083f06e445e1d2ebb208',1,'fbgemm_gpu']]], + ['dense_5fto_5fjagged_5fforward_25',['dense_to_jagged_forward',['../namespacefbgemm__gpu.html#aa5a76157eb45b9bd4159a548e8a73ce6',1,'fbgemm_gpu']]], + ['dense_5fto_5fjagged_5fforward_2ecu_26',['dense_to_jagged_forward.cu',['../dense__to__jagged__forward_8cu.html',1,'']]], + ['dequantize_5fload_27',['dequantize_load',['../namespacefbgemm__gpu.html#aee340827dbc6c104a400c30f47f3ee3b',1,'fbgemm_gpu::dequantize_load(const src_t *value, const float2)'],['../namespacefbgemm__gpu.html#a74358134402be54c82696697fe766b9a',1,'fbgemm_gpu::dequantize_load(const uint8_t *value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aaed854f05a4542637ac342bfab57bdc7',1,'fbgemm_gpu::dequantize_load(const uint8_t *value, const float2 qparams)']]], + ['dequantize_5fpacked_5fhfp8_28',['dequantize_packed_hfp8',['../namespacefbgemm__gpu.html#a0c388276a962d14b3070dc55202eaf66',1,'fbgemm_gpu']]], + ['dequantize_5fpermuted_5fint2_29',['dequantize_permuted_int2',['../namespacefbgemm__gpu.html#a96be7f5b4c81d93bf024348e7b85e364',1,'fbgemm_gpu']]], + ['dequantize_5fpermuted_5fint4_30',['dequantize_permuted_int4',['../namespacefbgemm__gpu.html#a2cf47d59251a0840fd370a95fa371681',1,'fbgemm_gpu']]], + 
['dequantize_5fpermuted_5fint8_31',['dequantize_permuted_int8',['../namespacefbgemm__gpu.html#adec3504b0909c4380da3c0aac89055de',1,'fbgemm_gpu']]], + ['dev_5fweights_32',['dev_weights',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a6d8072fe7f1cbd1cf456e3ea8a440ad3',1,'dev_weights: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#ac251990f6a37927ea6f8c58584ec7a4c',1,'dev_weights: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#ac251990f6a37927ea6f8c58584ec7a4c',1,'dev_weights: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a7ac7f1200f9cc67310a434e6da2bc8ae',1,'dev_weights: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['device_33',['DEVICE',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194ae10b6ab6a278644ce40631f62f360b6d',1,'fbgemm_gpu']]], + ['device_5finline_34',['DEVICE_INLINE',['../fbgemm__cuda__utils_8cuh.html#a8888b6e919f4a14975d3110a7425407d',1,'fbgemm_cuda_utils.cuh']]], + ['dim_5f_35',['dim_',['../structfbgemm__gpu_1_1_weight_row.html#a844805bf936642eb8849d76b506abf8d',1,'fbgemm_gpu::WeightRow']]], + ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcpu_36',['direct_mapped_lru_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#ac827cf6cd0f063a6747deaff14e4902d',1,'fbgemm_gpu']]], + ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcuda_37',['direct_mapped_lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#gae019b6879bd9f89a146e0700d5a4bd8b',1,'direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, 
at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate__byte_8cu.html#ab944b6f7e1df36b8ef0c4a911c1b0afb',1,'direct_mapped_lru_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, int64_t time_stamp, Tensor lru_state, Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats): lru_cache_populate_byte.cu']]], + ['direct_5fmapped_5flxu_5fcache_5flookup_5fcpu_38',['direct_mapped_lxu_cache_lookup_cpu',['../namespacefbgemm__gpu.html#a03949dd527b81758e43a4b48800c3bc6',1,'fbgemm_gpu']]], + ['direct_5fmapped_5flxu_5fcache_5flookup_5fcuda_39',['direct_mapped_lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#gab305ebdd3822794c5ac462bf5df4bb49',1,'direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#a9a01f6df03e867e1871df306a6289e06',1,'direct_mapped_lxu_cache_lookup_cuda(Tensor linear_cache_indices, Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats): lxu_cache.cu']]], + ['dispatch_5fdense_5fto_5fjagged_5fcase_40',['DISPATCH_DENSE_TO_JAGGED_CASE',['../dense__to__jagged__forward_8cu.html#ab94a3e4679ece26e229ec76dc9733ca2',1,'dense_to_jagged_forward.cu']]], + 
['dispatch_5femb_5fcache_5foutput_5ftypes_41',['DISPATCH_EMB_CACHE_OUTPUT_TYPES',['../dispatch__macros_8h.html#a8f06a63f75524d1985d76648b0fcf990',1,'dispatch_macros.h']]], + ['dispatch_5femb_5fcache_5ftypes_42',['DISPATCH_EMB_CACHE_TYPES',['../dispatch__macros_8h.html#ac4599e1c46b6eb357145dd791c6ae5c9',1,'dispatch_macros.h']]], + ['dispatch_5femb_5fgrad_5fcache_5ftypes_43',['DISPATCH_EMB_GRAD_CACHE_TYPES',['../dispatch__macros_8h.html#a10b99a9b7edecc89f4558ba0cf37c0ee',1,'dispatch_macros.h']]], + ['dispatch_5fkernel_5ffor_5fcache_5fcase_44',['DISPATCH_KERNEL_FOR_CACHE_CASE',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: 
gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: embedding_forward_split_template.cu']]], + ['dispatch_5fmacros_2eh_45',['dispatch_macros.h',['../dispatch__macros_8h.html',1,'']]], + ['dispatch_5foptimal_5fforward_5fkernel_46',['DISPATCH_OPTIMAL_FORWARD_KERNEL',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: embedding_forward_split_template.cu']]], + 
['dispatch_5foptimal_5fkernel_47',['DISPATCH_OPTIMAL_KERNEL',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_batch_index_select_dim0_backward_codegen_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_dense_split_weighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../embedding__backward__split__template_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: embedding_backward_split_template.cu'],['../embedding__backward__split__template_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: embedding_backward_split_template.cu']]], + ['dispatch_5foptimal_5fnobag_5fforward_5fkernel_48',['DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: embedding_forward_split_template.cu']]], + ['dispatch_5foutput_5ftypes_49',['DISPATCH_OUTPUT_TYPES',['../dispatch__macros_8h.html#a91c270ea1cbf887747abbaf8883b7175',1,'dispatch_macros.h']]], + ['dispatch_5fto_5fall_50',['DISPATCH_TO_ALL',['../sparse__ops__utils_8h.html#ae80e8b33bdef7d2849eb3d516ff67d1b',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fautograd_51',['DISPATCH_TO_AUTOGRAD',['../sparse__ops__utils_8h.html#aab6390a9590ead03a896aae2b93a96ed',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fautograd_5fcuda_52',['DISPATCH_TO_AUTOGRAD_CUDA',['../sparse__ops__utils_8h.html#adb242971e11b66b1f8f58c361e44b8e7',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fautograd_5fmeta_53',['DISPATCH_TO_AUTOGRAD_META',['../sparse__ops__utils_8h.html#a8ed65710de63bd56275d2ceded5d59b4',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fcpu_54',['DISPATCH_TO_CPU',['../sparse__ops__utils_8h.html#af5cf39897136f04c6f2ac5f3544c49c3',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fcuda_55',['DISPATCH_TO_CUDA',['../sparse__ops__utils_8h.html#a06de50f3ede518ff59612c9ada5a85c8',1,'sparse_ops_utils.h']]], + 
['dispatch_5fto_5fmeta_56',['DISPATCH_TO_META',['../sparse__ops__utils_8h.html#aa751218a0e9119ad6fa4d6d4df63fda5',1,'sparse_ops_utils.h']]], + ['div_57',['div',['../structfbgemm__gpu_1_1_vec4_acc_t.html#a36a62a848632d6968fe6723ee19277da',1,'fbgemm_gpu::Vec4AccT']]], + ['div_58',['Div',['../classfbgemm__gpu_1_1_fixed_divisor.html#a74e5cb4569d6d48cbc0ee32674a7e374',1,'fbgemm_gpu::FixedDivisor']]], + ['div_5fround_5fup_59',['DIV_ROUND_UP',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: embedding_forward_split_kernel_v2_template.cu']]], + ['div_5fround_5fup_60',['div_round_up',['../namespacenbit.html#a620ba1c7dba3e279e09759758b7a86db',1,'nbit::div_round_up()'],['../namespacefbgemm__gpu.html#a1e5f0f7703057bbda166a7723b16e6ef',1,'fbgemm_gpu::div_round_up()']]], + ['divmod_61',['DivMod',['../classfbgemm__gpu_1_1_fixed_divisor.html#abea2bdfe3649f1b944a15453e78ae523',1,'fbgemm_gpu::FixedDivisor::DivMod()'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a31faa05b32d14aec34e66800b6092329',1,'DivMod(global_warp_id, &t, &table_warp_id): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a31faa05b32d14aec34e66800b6092329',1,'DivMod(global_warp_id, &t, &table_warp_id): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#aef6bada16cf81832eb1e594eb47875d8',1,'fbgemm_gpu::DivMod()']]], + ['dll_5fpublic_62',['DLL_PUBLIC',['../ops__utils_8h.html#a29047de4dfe891435d8254535634ac1d',1,'ops_utils.h']]], + 
['do_5fwrite_63',['do_write',['../bench__utils_8cuh.html#af01122d304bbe0308fe6c59bebe33730',1,'bench_utils.cuh']]], + ['dummy_5fpacked_5faccessor32_64',['dummy_packed_accessor32',['../namespacefbgemm__gpu.html#a86a8cc18b54f6986ec4faeec0b223907',1,'fbgemm_gpu']]], + ['dummy_5fpacked_5faccessor64_65',['dummy_packed_accessor64',['../namespacefbgemm__gpu.html#aeb6f64d8ceb0189b03aa6808b97e8b16',1,'fbgemm_gpu']]] ]; diff --git a/search/all_5.js b/search/all_5.js index cf90c2986..4f397c881 100644 --- a/search/all_5.js +++ b/search/all_5.js @@ -1,5 +1,67 @@ var searchData= [ - ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_0',['generic_histogram_binning_calibration_by_feature_cpu',['../group__sparse-data-cpu.html#gaef2a0a8c27e3b8b2d72be5c95ba7539e',1,'fbgemm_gpu']]], - ['get_5funique_5findices_5fcuda_1',['get_unique_indices_cuda',['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu']]] + ['element_5fwise_5fmul_5f_0',['element_wise_mul_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< float >::element_wise_mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< at::Half >::element_wise_mul_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ae653589cf39f92811f8509363515532d',1,'fbgemm_gpu::Vec4T< at::Half >::element_wise_mul_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::element_wise_mul_(const Vec4T< float > 
&a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ae653589cf39f92811f8509363515532d',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::element_wise_mul_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a077873e0dd3516731c2302c7b3dee475',1,'fbgemm_gpu::Vec4T< double >::element_wise_mul_()']]], + ['else_1',['else',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a0544c3fe466e421738dae463968b70ba',1,'else: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#a0544c3fe466e421738dae463968b70ba',1,'fbgemm_gpu::else']]], + ['embedding_20cpu_20operators_2',['Embedding CPU Operators',['../group__embedding-cpu.html',1,'']]], + ['embedding_20cuda_20operators_3',['Embedding CUDA Operators',['../group__embedding-cuda.html',1,'']]], + ['embedding_5fbackward_5fdense_5fhost_2ecpp_4',['embedding_backward_dense_host.cpp',['../embedding__backward__dense__host_8cpp.html',1,'']]], + ['embedding_5fbackward_5fdense_5fhost_5fcpu_2ecpp_5',['embedding_backward_dense_host_cpu.cpp',['../embedding__backward__dense__host__cpu_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fcpu_5fapprox_5ftemplate_2ecpp_6',['embedding_backward_split_cpu_approx_template.cpp',['../embedding__backward__split__cpu__approx__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fcpu_5ftemplate_2ecpp_7',['embedding_backward_split_cpu_template.cpp',['../embedding__backward__split__cpu__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fgrad_5ftemplate_2ecu_8',['embedding_backward_split_grad_template.cu',['../embedding__backward__split__grad__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fhost_5fcpu_5ftemplate_2ecpp_9',['embedding_backward_split_host_cpu_template.cpp',['../embedding__backward__split__host__cpu__template_8cpp.html',1,'']]], + 
['embedding_5fbackward_5fsplit_5fhost_5ftemplate_2ecpp_10',['embedding_backward_split_host_template.cpp',['../embedding__backward__split__host__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5findice_5fweights_5ftemplate_2ecu_11',['embedding_backward_split_indice_weights_template.cu',['../embedding__backward__split__indice__weights__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fkernel_5fcta_5ftemplate_2ecu_12',['embedding_backward_split_kernel_cta_template.cu',['../embedding__backward__split__kernel__cta__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fkernel_5fwarp_5ftemplate_2ecu_13',['embedding_backward_split_kernel_warp_template.cu',['../embedding__backward__split__kernel__warp__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5ftemplate_2ecu_14',['embedding_backward_split_template.cu',['../embedding__backward__split__template_8cu.html',1,'']]], + ['embedding_5fbackward_5ftemplate_5fhelpers_2ecuh_15',['embedding_backward_template_helpers.cuh',['../embedding__backward__template__helpers_8cuh.html',1,'']]], + ['embedding_5fbag_5frowwise_5fprune_16',['embedding_bag_rowwise_prune',['../namespacefbgemm__gpu.html#ae586c9948dba8a67abf44ada58425fba',1,'fbgemm_gpu']]], + ['embedding_5fbounds_5fcheck_2ecu_17',['embedding_bounds_check.cu',['../embedding__bounds__check_8cu.html',1,'']]], + ['embedding_5fbounds_5fcheck_5fhost_2ecpp_18',['embedding_bounds_check_host.cpp',['../embedding__bounds__check__host_8cpp.html',1,'']]], + ['embedding_5fbounds_5fcheck_5fhost_5fcpu_2ecpp_19',['embedding_bounds_check_host_cpu.cpp',['../embedding__bounds__check__host__cpu_8cpp.html',1,'']]], + ['embedding_5fcommon_2eh_20',['embedding_common.h',['../embedding__common_8h.html',1,'']]], + ['embedding_5fforward_5fquantized_5fcpu_5ftemplate_2ecpp_21',['embedding_forward_quantized_cpu_template.cpp',['../embedding__forward__quantized__cpu__template_8cpp.html',1,'']]], + 
['embedding_5fforward_5fquantized_5fhost_2ecpp_22',['embedding_forward_quantized_host.cpp',['../embedding__forward__quantized__host_8cpp.html',1,'']]], + ['embedding_5fforward_5fquantized_5fhost_5fcpu_2ecpp_23',['embedding_forward_quantized_host_cpu.cpp',['../embedding__forward__quantized__host__cpu_8cpp.html',1,'']]], + ['embedding_5fforward_5fquantized_5fsplit_5flookup_2ecu_24',['embedding_forward_quantized_split_lookup.cu',['../embedding__forward__quantized__split__lookup_8cu.html',1,'']]], + ['embedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5ftemplate_2ecu_25',['embedding_forward_quantized_split_nbit_host_template.cu',['../embedding__forward__quantized__split__nbit__host__template_8cu.html',1,'']]], + ['embedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5ftemplate_2ecu_26',['embedding_forward_quantized_split_nbit_kernel_template.cu',['../embedding__forward__quantized__split__nbit__kernel__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fcpu_2ecpp_27',['embedding_forward_split_cpu.cpp',['../embedding__forward__split__cpu_8cpp.html',1,'']]], + ['embedding_5fforward_5fsplit_5fcpu_2eh_28',['embedding_forward_split_cpu.h',['../embedding__forward__split__cpu_8h.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5fnobag_5fsmall_5ftemplate_2ecu_29',['embedding_forward_split_kernel_nobag_small_template.cu',['../embedding__forward__split__kernel__nobag__small__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5ftemplate_2ecu_30',['embedding_forward_split_kernel_template.cu',['../embedding__forward__split__kernel__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5fv2_5ftemplate_2ecu_31',['embedding_forward_split_kernel_v2_template.cu',['../embedding__forward__split__kernel__v2__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fmeta_5ftemplate_2ecpp_32',['embedding_forward_split_meta_template.cpp',['../embedding__forward__split__meta__template_8cpp.html',1,'']]], + 
['embedding_5fforward_5fsplit_5ftemplate_2ecu_33',['embedding_forward_split_template.cu',['../embedding__forward__split__template_8cu.html',1,'']]], + ['embedding_5fforward_5ftemplate_5fhelpers_2ecuh_34',['embedding_forward_template_helpers.cuh',['../embedding__forward__template__helpers_8cuh.html',1,'']]], + ['embedding_5finplace_5fupdate_2ecu_35',['embedding_inplace_update.cu',['../embedding__inplace__update_8cu.html',1,'']]], + ['embedding_5finplace_5fupdate_2eh_36',['embedding_inplace_update.h',['../embedding__inplace__update_8h.html',1,'']]], + ['embedding_5finplace_5fupdate_5fcpu_37',['embedding_inplace_update_cpu',['../namespacefbgemm__gpu.html#aaa1807fa25793e61743b75d27db063cc',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fcpu_2ecpp_38',['embedding_inplace_update_cpu.cpp',['../embedding__inplace__update__cpu_8cpp.html',1,'']]], + ['embedding_5finplace_5fupdate_5fcpu_5fkernel_39',['embedding_inplace_update_cpu_kernel',['../namespacefbgemm__gpu.html#af3e9e1ce0f6340f233ef6ae8934454cf',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fcuda_40',['embedding_inplace_update_cuda',['../namespacefbgemm__gpu.html#a54bf7e9b54b5263cf039100cda517c34',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fgpu_2ecpp_41',['embedding_inplace_update_gpu.cpp',['../embedding__inplace__update__gpu_8cpp.html',1,'']]], + ['embedding_5finplace_5fupdate_5ftest_2ecpp_42',['embedding_inplace_update_test.cpp',['../embedding__inplace__update__test_8cpp.html',1,'']]], + ['embedding_5fop_5fregistration_2eh_43',['embedding_op_registration.h',['../embedding__op__registration_8h.html',1,'']]], + ['embedding_5fops_5fplaceholder_2ecpp_44',['embedding_ops_placeholder.cpp',['../embedding__ops__placeholder_8cpp.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5fdevice_5fkernel_5ftemplate_2ecuh_45',['embedding_optimizer_split_device_kernel_template.cuh',['../embedding__optimizer__split__device__kernel__template_8cuh.html',1,'']]], + 
['embedding_5foptimizer_5fsplit_5fhost_5ftemplate_2ecpp_46',['embedding_optimizer_split_host_template.cpp',['../embedding__optimizer__split__host__template_8cpp.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5fkernel_5ftemplate_2ecu_47',['embedding_optimizer_split_kernel_template.cu',['../embedding__optimizer__split__kernel__template_8cu.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5ftemplate_2ecu_48',['embedding_optimizer_split_template.cu',['../embedding__optimizer__split__template_8cu.html',1,'']]], + ['embeddingrocksdb_49',['EmbeddingRocksDB',['../classssd_1_1_embedding_rocks_d_b.html',1,'EmbeddingRocksDB'],['../classssd_1_1_embedding_rocks_d_b.html#a703b26ce10b84fa35ea496114f1ebbb5',1,'ssd::EmbeddingRocksDB::EmbeddingRocksDB()']]], + ['embeddings_20operators_20cpu_50',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['embeddings_20operators_20cuda_51',['Permute Pooled Embeddings Operators (CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]], + ['emulate_5fcache_5fmiss_52',['emulate_cache_miss',['../split__embeddings__cache__cuda_8cuh.html#a8f112d04838c2019df06ffbb84dbafba',1,'emulate_cache_miss(at::Tensor lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, at::Tensor uvm_cache_stats): lru_cache_find.cu'],['../lru__cache__find_8cu.html#a8a80ce6ea3d62b9f22ac391767b34538',1,'emulate_cache_miss(Tensor lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, Tensor uvm_cache_stats): lru_cache_find.cu']]], + ['enum_5fitem_53',['enum_item',['../namespacefbgemm__gpu.html#aef71de4120929d2410f5d766948f8eaf',1,'fbgemm_gpu']]], + ['enum_5fitems_54',['enum_items',['../namespacefbgemm__gpu.html#a5fdc84ce2202ea07eb2e865847bd8f34',1,'fbgemm_gpu']]], + ['enum_5fquery_55',['enum_query',['../classfbgemm__gpu_1_1enum__registration.html#a84cad106fb24ea59687f6708d197cc64',1,'fbgemm_gpu::enum_registration']]], + 
['enum_5fregistration_56',['enum_registration',['../classfbgemm__gpu_1_1enum__registration.html',1,'enum_registration< T >'],['../classfbgemm__gpu_1_1enum__registration.html#afa13a8542c6dde450214a387cacf3a9b',1,'fbgemm_gpu::enum_registration::enum_registration()']]], + ['enum_5fresult_57',['enum_result',['../namespacefbgemm__gpu.html#adbdc3251cbd2e995dfa31ffdf2c2df8e',1,'fbgemm_gpu']]], + ['enum_5futils_2eh_58',['enum_utils.h',['../enum__utils_8h.html',1,'']]], + ['eps_59',['eps',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a3af1a7fb1e7c77ec014031cd2e1d0837',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['evict_60',['evict',['../structfbgemm__gpu_1_1_weight_row.html#a64c9f91fe6b60f7294ce6bb363bdb234',1,'fbgemm_gpu::WeightRow']]], + ['exclusive_5fscan_5fptrs_5fcpu_61',['exclusive_scan_ptrs_cpu',['../namespacefbgemm__gpu.html#aa8eb0fcd765dc4580084f6d098604e0d',1,'fbgemm_gpu']]], + ['expand_5finto_5fjagged_5fpermute_5fcpu_62',['expand_into_jagged_permute_cpu',['../namespacefbgemm__gpu.html#a02fab30a12d9d6ee6e6ae68bc8041481',1,'fbgemm_gpu']]], + ['expand_5finto_5fjagged_5fpermute_5fcuda_63',['expand_into_jagged_permute_cuda',['../group__sparse-data-cuda.html#ga2402de1c0102b21af5f2bd5a50d30309',1,'fbgemm_gpu']]] ]; diff --git a/search/all_6.js b/search/all_6.js index 46cb13feb..baddf1d79 100644 --- a/search/all_6.js +++ b/search/all_6.js @@ -1,5 +1,74 @@ var searchData= [ - ['histogram_5fbinning_5fcalibration_5fcpu_0',['histogram_binning_calibration_cpu',['../group__sparse-data-cpu.html#ga201bb2241fc9d582d6c0fe968b0e71ca',1,'fbgemm_gpu']]], - ['host_5flxu_5fcache_5fslot_1',['host_lxu_cache_slot',['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu']]] + 
['false_0',['false',['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0ad31f76c1f9349ef8b21ca138e897cc',1,'false: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a0ad31f76c1f9349ef8b21ca138e897cc',1,'false: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a0ad31f76c1f9349ef8b21ca138e897cc',1,'false: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a0ad31f76c1f9349ef8b21ca138e897cc',1,'false: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a0ad31f76c1f9349ef8b21ca138e897cc',1,'false: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__backward__split__grad_8cu.html#a05118d1db073d73fe80ee01b40791cf6',1,'false(const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > dev_or_uvm_unique_indices, const int info_B_num_bits): gen_embedding_backward_split_grad.cu'],['../namespacenbit.html#af9110ca4f61dbcc64cf0f8118cdc97f1',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > 
weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a249c23ff8c01f39126136bc2539952fe',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a508c0bc5d94dee1c736f755730ca2beb',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a4c70aaadd08c9449d6cedae3e20ea68c',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a90040b4a20a116df4d0c66c160e6e764',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aab2d7afb4b654ce45cfc2748e78ac253',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a308832faa1970c724a5589233e352f17',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a04aec5313af7eaae824c4738345d4b6a',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a741f318d94db0cb3578afea1e4630cc9',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a7fd32cfedb1f12bb236748026afb62f0',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6798d1239a1e727f202aa623317a936c',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae6208ce34aaecc5de1eea88805352dda',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a0b028a0d4eab6f827b0747e791479111',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ab5d4641eabcd497e393236456c66f662',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a92aa5aa305b64d0be3324318e749f727',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a1628074b31c14dcc07fd3d859e9ddf89',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const 
int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6502e80c3fcff2fd9816c54de76346c5',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a62b93a28ed713cca24870802bd016e03',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a59ea73f8b7947242291927c972ebf040',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations)'],['../namespacenbit.html#a0525091bf8439436819eef72a5c45ca6',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae661502dfcff9025fb909b009a194e2f',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t 
row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a4ef67d9b7b4ba3292ad30493c9daf596',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a3f1b79dd7ed41442b0dfb240f2ab0ec7',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a18e19fee6513187e93010f11a932f6de',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > lxu_cache_locations)'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a311bf35bff79e995c3e6d7d2e6a69952',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8c5c41f01ea1d775126bc194e1e95ecc',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a16040890e0367b0669f51c05b4715ecd',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const 
FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8f19e545f5c45f11ee4c5898decb994c',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a54d26a841bc71bb0c9fdcb2f657d3058',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aaaa117179cc47a2a2fbdb86da6066081',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aff4f86de443efa79fda96f93b78b26d4',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a0c3c6fbc30353d25b4ada5dba7ed9ad3',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t 
max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a71b5f71e99a903571a45d1bfb5dd6537',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a678e19ebc31d391a2101878805cfec04',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5e71bf5354b291e99138e5b51a2c8987',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a39a233002f8c2aadb3206424d3cf33ed',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afb14ab09e129e59e6e323cc8ad114e0a',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const 
FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ac335cccca06f6bd0865b65bb20192a24',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6a008e7d608ca15741939511b1f48878',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#acb117339908a6826b75877db094f909d',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afbb29ff03c359916c050f25deac56e9e',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1695088ded9f86314e0bc374c4ad57f9',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, 
const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#af26c8601b994cb4ad7a7d08104ccc876',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab84745c1fc3e4c483778cc8dc325eb7f',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const 
int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0b7156fcc5a6e05dd2ab1a0dd33f339d',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5acddab9f4eec4c91ba1403005c3ec7d',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a79860db3c0c6c510a821d9ac0a4c6764',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ 
const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7ccf30944601039563603d837470824c',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['fatal_1',['FATAL',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaa19da7170bea36556dde582519795f3fc',1,'fbgemm_gpu']]], + ['fbgemm_5fcuda_5futils_2ecuh_2',['fbgemm_cuda_utils.cuh',['../fbgemm__cuda__utils_8cuh.html',1,'']]], + ['fbgemm_5fdispatch_5ffloat_5fand_5fhalf_3',['FBGEMM_DISPATCH_FLOAT_AND_HALF',['../dispatch__macros_8h.html#a6db9b7506116844ae45993577c3b9ac4',1,'dispatch_macros.h']]], + ['fbgemm_5fdispatch_5ffloat_5fand_5fhalf_5fcase_4',['FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE',['../dispatch__macros_8h.html#a60faa23c8d1bf9d75a2e598a5654ecff',1,'dispatch_macros.h']]], + 
['fbgemm_5fdispatch_5ffloat_5fhalf_5fand_5fbfloat16_5',['FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16',['../dispatch__macros_8h.html#ab9329efe2d7882cbc2bd358b6672c292',1,'dispatch_macros.h']]], + ['fbgemm_5fdispatch_5ffloat_5fhalf_5fand_5fbfloat16_5fcase_6',['FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE',['../dispatch__macros_8h.html#a7c7e35b09a14b3d5b76339803712ce7e',1,'dispatch_macros.h']]], + ['fbgemm_5fgpu_7',['fbgemm_gpu',['../namespacefbgemm__gpu.html',1,'']]], + ['fbgemm_5fgpu_5fcub_5fns_5fprefix_8',['FBGEMM_GPU_CUB_NS_PREFIX',['../cub__namespace__postfix_8cuh.html#a12567f2486c4686871a5330dbd8e9bb4',1,'cub_namespace_postfix.cuh']]], + ['fbgemm_5fgpu_5fenum_5fcreate_5ftag_9',['FBGEMM_GPU_ENUM_CREATE_TAG',['../enum__utils_8h.html#a769a65d91133d4f233bcf10280ff7a3c',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fglogal_10',['FBGEMM_GPU_ENUM_GLOGAL',['../enum__utils_8h.html#adc8e24189b6f5a58092ade0b27e197b1',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fitem_11',['FBGEMM_GPU_ENUM_ITEM',['../enum__utils_8h.html#aef8d28be61e5e22bac45bf59c53dabbd',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fop_12',['FBGEMM_GPU_ENUM_OP',['../enum__utils_8h.html#abcc6d46ce5e5452b5b49f96ae0aa67f3',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fregister_5fend_13',['FBGEMM_GPU_ENUM_REGISTER_END',['../enum__utils_8h.html#a1fc46fffc78f3820ce4668b6b2a92b55',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fregister_5fstart_14',['FBGEMM_GPU_ENUM_REGISTER_START',['../enum__utils_8h.html#a3c1089cc9b549d33d50c20c14b348950',1,'FBGEMM_GPU_ENUM_REGISTER_START: enum_utils.h'],['../namespacefbgemm__gpu.html#a0e41e402bfba1e346c6dcc610252e94b',1,'fbgemm_gpu::FBGEMM_GPU_ENUM_REGISTER_START()']]], + ['fbgemm_5fgpu_5fenum_5ftag_15',['FBGEMM_GPU_ENUM_TAG',['../enum__utils_8h.html#aae161db28429e0e2aa9001448f52e2f4',1,'enum_utils.h']]], + ['fbgemm_5fop_5fdispatch_16',['FBGEMM_OP_DISPATCH',['../ops__utils_8h.html#aed63a3f5bb9ae1c01f230bee2d95ea05',1,'FBGEMM_OP_DISPATCH: 
ops_utils.h'],['../batched__dense__vec__jagged__2d__mul__backward_8cu.html#a505e960fb46aaed90cbf00060c4f7f73',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul_backward", fbgemm_gpu::batched_dense_vec_jagged_2d_mul_backward): batched_dense_vec_jagged_2d_mul_backward.cu'],['../batched__dense__vec__jagged__2d__mul__forward_8cu.html#ae6d9314c75be8852a64432f06a618a51',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul_forward", fbgemm_gpu::batched_dense_vec_jagged_2d_mul_forward): batched_dense_vec_jagged_2d_mul_forward.cu'],['../dense__to__jagged__forward_8cu.html#a2f09e89f2172cc358cfffdc866220276',1,'FBGEMM_OP_DISPATCH(CUDA, "dense_to_jagged_forward", fbgemm_gpu::dense_to_jagged_forward): dense_to_jagged_forward.cu'],['../jagged__dense__bmm__forward_8cu.html#a10db24b3c6258b287f12eb591b6b1274',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_bmm_forward", fbgemm_gpu::jagged_dense_bmm_forward_cuda): jagged_dense_bmm_forward.cu'],['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html#a4dc38a80ec480c8ba5e73920df40ade3',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_dense_elementwise_add_jagged_output_forward", fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output_forward): jagged_dense_dense_elementwise_add_jagged_output_forward.cu'],['../jagged__dense__elementwise__mul__backward_8cu.html#a56064ede1846b15cd7ee664d3ac0f447',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul_backward", fbgemm_gpu::jagged_dense_elementwise_mul_backward): jagged_dense_elementwise_mul_backward.cu'],['../jagged__dense__elementwise__mul__forward_8cu.html#a55ae1a4e6489decd594fc7c77fb36cd4',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul_forward", fbgemm_gpu::jagged_dense_elementwise_mul_forward): jagged_dense_elementwise_mul_forward.cu'],['../jagged__index__add__2d__forward_8cu.html#a6fbf3dbceb513f8dfa17d68303b4e1f1',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_index_add_2d_forward", fbgemm_gpu::jagged_index_add_2d_forward_cuda): 
jagged_index_add_2d_forward.cu'],['../jagged__index__select__2d__forward_8cu.html#a769ab9425e6b9229e5197a606072f7f7',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_index_select_2d_forward", fbgemm_gpu::jagged_index_select_2d_forward_cuda): jagged_index_select_2d_forward.cu'],['../jagged__jagged__bmm__forward_8cu.html#ad970c4b273bd75194ccced952b277f40',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_jagged_bmm_forward", fbgemm_gpu::jagged_jagged_bmm_forward_cuda): jagged_jagged_bmm_forward.cu'],['../jagged__softmax__backward_8cu.html#af86af3150ade27ed65bffd51e7fd389a',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax_backward", fbgemm_gpu::jagged_softmax_backward_cuda): jagged_softmax_backward.cu'],['../jagged__softmax__forward_8cu.html#ad64b64d7d37e8e47389d74bbb5b9287f',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax_forward", fbgemm_gpu::jagged_softmax_forward_cuda): jagged_softmax_forward.cu'],['../jagged__tensor__ops_8cu.html#ae9145e7dc8cdcfab08478c78e11806ee',1,'FBGEMM_OP_DISPATCH(CUDA, "dense_to_jagged", fbgemm_gpu::dense_to_jagged): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#af36ae71857641f82f406e9d03287e165',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense", fbgemm_gpu::jagged_to_padded_dense): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a63e1ce09a4f40dd4f79b7ceb985b2faf',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_add", fbgemm_gpu::jagged_dense_elementwise_add): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a4f366150fd0ce1400047ea614232e9f8',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_dense_elementwise_add_jagged_output", fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#afd6b82766bc27ff6c2e957e57ec2947e',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul", fbgemm_gpu::jagged_dense_elementwise_mul): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a83e06ed43d316e587c86bd1b83a233a8',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul", 
fbgemm_gpu::batched_dense_vec_jagged_2d_mul): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a5a65d954fda4f3313d036b22b3232872',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_1d_to_dense", fbgemm_gpu::jagged_1d_to_dense): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a84c5e68f36966340db42aa25785290df',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_2d_to_dense", fbgemm_gpu::jagged_2d_to_dense): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a6b3f90be325532b25c5df0c87c15e083',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax", fbgemm_gpu::jagged_softmax): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a56fea1ad733f259a42c89661e1bf2637',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_jagged_bmm", fbgemm_gpu::jagged_jagged_bmm): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a48e6bd6975582a7ce4ceff6712fa6ef9',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_bmm", fbgemm_gpu::jagged_dense_bmm): jagged_tensor_ops.cu'],['../jagged__to__padded__dense__backward_8cu.html#a0ec346f5fe59608b8e13809432c9a389',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense_backward", fbgemm_gpu::jagged_to_padded_dense_backward): jagged_to_padded_dense_backward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a1526839450b4cbf68a2d6a70673e273a',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense_forward", fbgemm_gpu::jagged_to_padded_dense_forward): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a9797a098549c8193d6beb70cb5d7da4f',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_1d_to_dense", fbgemm_gpu::stacked_jagged_1d_to_dense_gpu): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a84d4e43e8339a03b14fe872dd3b2d50a',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_2d_to_dense", fbgemm_gpu::stacked_jagged_2d_to_dense_gpu): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a61110a1a4f03edaa3322b245624b294e',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_2d_to_dense_forward", 
fbgemm_gpu::stacked_jagged_2d_to_dense_forward_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a1a53264bb9ade4d2796b87a966ab450c',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_2d_to_dense_backward", fbgemm_gpu::stacked_jagged_2d_to_dense_backward_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a65d732670fec1bee849caf445b2903e7',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_add_jagged_output", fbgemm_gpu::jagged_dense_elementwise_add_jagged_output_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__unique__indices_8cu.html#a674314745cbd8dd913142d0660083851',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_unique_indices", fbgemm_gpu::jagged_unique_indices_cuda): jagged_unique_indices.cu'],['../jagged__unique__indices_8cu.html#aaf228a3ce26c3ae9c749573883b59be5',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_hash_size_cumsum", fbgemm_gpu::jagged_hash_size_cumsum_cuda): jagged_unique_indices.cu'],['../keyed__jagged__index__select__dim1_8cu.html#a69db0b3f600c7c45db29069cd05d3bea',1,'FBGEMM_OP_DISPATCH(CUDA, "keyed_jagged_index_select_dim1", fbgemm_gpu::keyed_jagged_index_select_dim_1_gpu): keyed_jagged_index_select_dim1.cu'],['../merge__pooled__embedding__ops__cpu_8cpp.html#a1ec90ab98c9d6c18099549dce392fd65',1,'FBGEMM_OP_DISPATCH(CPU, "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings_cpu): merge_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a37755fb9333b1017d34b49ee0247004e',1,'FBGEMM_OP_DISPATCH(CPU, "permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a83bf468fc58e605fc64461726caad8cf',1,'FBGEMM_OP_DISPATCH(CPU, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a765ed01147edbd93b01e5f91fe12f68b',1,'FBGEMM_OP_DISPATCH(CPU, 
"permute_duplicate_pooled_embs", fbgemm_gpu::permute_duplicate_pooled_embs_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#aa0ac9a165fb46ae5738c08e0a887a97b',1,'FBGEMM_OP_DISPATCH(CPU, "permute_duplicate_pooled_embs_auto_grad", fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a941e973d6b74e10046ae3373ba10bda2',1,'FBGEMM_OP_DISPATCH(Meta, "permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_meta): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a7590e07b38befcd57df567cb054cfad3',1,'FBGEMM_OP_DISPATCH(Meta, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad_meta): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a858ecafbed2f155f42fe99391b82e4b4',1,'FBGEMM_OP_DISPATCH(Autograd, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad): permute_pooled_embedding_ops_cpu.cpp'],['../quantize__bfloat16_8cu.html#a44eca6a446116eaa006c5bd0488d62f2',1,'FBGEMM_OP_DISPATCH(CUDA, "Bfloat16QuantizedToFloat", fbgemm_gpu::_bfloat16_to_float_gpu): quantize_bfloat16.cu'],['../quantize__bfloat16_8cu.html#a4ed2eb1cae3301906c55dc98ee5ce687',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToBfloat16Quantized", fbgemm_gpu::_float_to_bfloat16_gpu): quantize_bfloat16.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a360b78a6e199bcda032c8896708398db',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFused8BitRowwiseQuantized", fbgemm_gpu::_float_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#afed513cf23a1957fa7f44309ed54288e',1,'FBGEMM_OP_DISPATCH(CUDA, "HalfToFused8BitRowwiseQuantized", fbgemm_gpu::_half_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#af35eb9fa075d341e379886496b6f2dad',1,'FBGEMM_OP_DISPATCH(CUDA, 
"FloatOrHalfToFused8BitRowwiseQuantized", fbgemm_gpu::_single_or_half_precision_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#ac2c5ae3ba26c4c71b5e42651752f6e05',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloat", fbgemm_gpu::_fused8bitrowwise_to_float_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a5ed3f01bedfeee57b88e3343ebab204a',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToHalf", fbgemm_gpu::_fused8bitrowwise_to_half_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#ac5c42d23d15559e0fab4a67b274ac722',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloatOrHalf", fbgemm_gpu::_fused8bitrowwise_to_single_or_half_precision_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a36f61e129797f0efa0fa02acd3bf1628',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloatMixedDim", fbgemm_gpu::_fused8bitrowwise_to_float_mixed_dim_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#ac0d21a1093187621384e9f7ee12af6f5',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_float_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#acc803cc30f01a51dcba4d3e89471a836',1,'FBGEMM_OP_DISPATCH(CUDA, "HalfToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_half_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#a9235db627f7b35c43f5a8baee9c6e73f',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_float_or_half_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#a04df767b706b47ca163b528c0ec49659',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToFloat", fbgemm_gpu::_fusednbitrowwise_to_float_gpu): 
quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#ae8e33c20c4bfee06ceac1b42b87d40e0',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToHalf", fbgemm_gpu::_fusednbitrowwise_to_half_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#af782044b726c577b026de55ab1e37681',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf", fbgemm_gpu::_fusednbitrowwise_to_float_or_half_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__hfp8_8cu.html#a38d08a293e27467dfdda3bb72ea27596',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToHFP8Quantized", fbgemm_gpu::_float_to_hfp8_gpu): quantize_hfp8.cu'],['../quantize__hfp8_8cu.html#a137d7c9cbf1612b410dd45b3bbebbea0',1,'FBGEMM_OP_DISPATCH(CUDA, "HFP8QuantizedToFloat", fbgemm_gpu::_hfp8_to_float_gpu): quantize_hfp8.cu'],['../quantize__msfp_8cu.html#abba68956be833439bf5ecabfe3880300',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToMSFPQuantized", fbgemm_gpu::_float_to_msfp_gpu): quantize_msfp.cu'],['../quantize__msfp_8cu.html#ace6d6f85efbdd32b7378b07a2e394166',1,'FBGEMM_OP_DISPATCH(CUDA, "MSFPQuantizedToFloat", fbgemm_gpu::_msfp_to_float_gpu): quantize_msfp.cu'],['../quantize__ops__gpu_8cpp.html#a0d298145c58d3db95b0838ab9e321626',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFP8RowwiseQuantized", fbgemm_gpu::_float_to_FP8rowwise_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a0ae0af8cb484307360d889119a25a870',1,'FBGEMM_OP_DISPATCH(CUDA, "FP8RowwiseQuantizedToFloat", fbgemm_gpu::_FP8rowwise_to_float_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a6f70026edd736cca0ec96d6369571e06',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToPaddedFP8RowwiseQuantized", fbgemm_gpu::_float_to_paddedFP8rowwise_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a18e52d6b9f96ae0c9f7552f54808d958',1,'FBGEMM_OP_DISPATCH(CUDA, "PaddedFP8RowwiseQuantizedToFloat", fbgemm_gpu::_paddedFP8rowwise_to_float_gpu): 
quantize_ops_gpu.cpp'],['../sparse__async__cumsum_8cu.html#a37ee97bf0cf5f3e51b626963d0905d5d',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_exclusive_cumsum", fbgemm_gpu::asynchronous_exclusive_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__async__cumsum_8cu.html#a956fe5a496592a618c66c5cdd7e76aee',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_complete_cumsum", fbgemm_gpu::asynchronous_complete_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__async__cumsum_8cu.html#a1fe1796f45f950ba568e1f5fb38d3da8',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_inclusive_cumsum", fbgemm_gpu::asynchronous_inclusive_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__block__bucketize__features_8cu.html#ac393348a81fe14a2734e4a221b3e028c',1,'FBGEMM_OP_DISPATCH(CUDA, "block_bucketize_sparse_features", fbgemm_gpu::block_bucketize_sparse_features_cuda): sparse_block_bucketize_features.cu'],['../sparse__bucketize__features_8cu.html#a9f5c60b5d418eded60f0c447ae38c450',1,'FBGEMM_OP_DISPATCH(CUDA, "bucketize_sparse_features", fbgemm_gpu::bucketize_sparse_features_cuda): sparse_bucketize_features.cu'],['../sparse__expand__into__jagged__permute_8cu.html#af4f7b3da9350e95957c452753c2569a7',1,'FBGEMM_OP_DISPATCH(CUDA, "expand_into_jagged_permute", fbgemm_gpu::expand_into_jagged_permute_cuda): sparse_expand_into_jagged_permute.cu'],['../sparse__invert__permute_8cu.html#a472cc598c3ed7832c2866f8aaed5fdc8',1,'FBGEMM_OP_DISPATCH(CUDA, "invert_permute", fbgemm_gpu::invert_permute_cuda): sparse_invert_permute.cu'],['../sparse__permute102_8cu.html#aa5a7770ccd8e2e72012a3035579d2cfc',1,'FBGEMM_OP_DISPATCH(CUDA, "permute102_baddbmm_permute102", fbgemm_gpu::permute102_baddbmm_permute102_cuda): sparse_permute102.cu'],['../sparse__permute__1d_8cu.html#aa28c2751b385fa3416aa12a3dd2cb039',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_1D_sparse_data", fbgemm_gpu::permute_1D_sparse_data_cuda): sparse_permute_1d.cu'],['../sparse__permute__2d_8cu.html#ab884888820b4be2c942de1bf75211b2b',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sparse_data", 
fbgemm_gpu::permute_2D_sparse_data_cuda): sparse_permute_2d.cu'],['../sparse__permute__2d_8cu.html#aab7fc0ba2b46743531f3d2fe4392be84',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_2D_sparse_data", fbgemm_gpu::permute_2D_sparse_data_cuda): sparse_permute_2d.cu'],['../sparse__permute__2d_8cu.html#a16728339b915be3a73e7bced8598849f',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sparse_features", fbgemm_gpu::permute_sparse_features_cuda): sparse_permute_2d.cu'],['../sparse__permute__embeddings_8cu.html#a2281b30913187261c5233174f3f9622c',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sequence_embeddings", fbgemm_gpu::permute_sequence_embeddings_cuda): sparse_permute_embeddings.cu'],['../sparse__range_8cu.html#a85fc3de0cb5d8acd0c760b984ff30f3b',1,'FBGEMM_OP_DISPATCH(CUDA, "offsets_range", fbgemm_gpu::offsets_range_cuda): sparse_range.cu'],['../sparse__range_8cu.html#a7a62f9a9f0e7b39a3331e3cee8be776e',1,'FBGEMM_OP_DISPATCH(CUDA, "lengths_range", fbgemm_gpu::lengths_range_cuda): sparse_range.cu'],['../sparse__segment__sum__csr_8cu.html#ae64cf20351791f453c8f3156ed01c224',1,'FBGEMM_OP_DISPATCH(CUDA, "segment_sum_csr", fbgemm_gpu::segment_sum_csr_cuda): sparse_segment_sum_csr.cu']]], + ['fbgemm_5ftensor_5faccessor_2eh_17',['fbgemm_tensor_accessor.h',['../fbgemm__tensor__accessor_8h.html',1,'']]], + ['fd_5fb_18',['fd_B',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a2d1eb541d5bdde0bf935a46f15efb9f4',1,'fd_B: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a488a7f217a1d4705fbcdce81e0a028b2',1,'fd_B: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a6b3d6f3af7d65ed111be48db11a4cc24',1,'fd_B: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu']]], + ['fd_5fnum_5fwarps_5fper_5flist_19',['fd_num_warps_per_list',['../namespacefbgemm__gpu.html#a8d2f3cd432a3bf2de49086fb33ef71cb',1,'fbgemm_gpu']]], + 
['fd_5fnum_5fwarps_5fper_5ftable_20',['fd_num_warps_per_table',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a4a300401a48c1b4c0d98e372a4293da2',1,'fd_num_warps_per_table: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a91261c861a9c2e7ff7936dba196c034e',1,'fd_num_warps_per_table: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a4a300401a48c1b4c0d98e372a4293da2',1,'fd_num_warps_per_table: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a91261c861a9c2e7ff7936dba196c034e',1,'fd_num_warps_per_table: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a91261c861a9c2e7ff7936dba196c034e',1,'fd_num_warps_per_table: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a91261c861a9c2e7ff7936dba196c034e',1,'fd_num_warps_per_table: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['fint32_21',['fint32',['../namespacefbgemm__gpu.html#a4783bbd9753251a335f9f8fa2dd97c8c',1,'fbgemm_gpu']]], + ['fixed_5fl_5fper_5fwarp_22',['fixed_L_per_warp',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#acac1f3391492ec3c4a8942ec48197027',1,'gen_batch_index_select_dim0_forward_kernel_small.cu']]], + ['fixeddivisor_23',['FixedDivisor',['../classfbgemm__gpu_1_1_fixed_divisor.html',1,'FixedDivisor'],['../classfbgemm__gpu_1_1_fixed_divisor.html#a80d1fd876167b0bbb2d6a7ebdaa97270',1,'fbgemm_gpu::FixedDivisor::FixedDivisor()']]], + ['float_24',['float',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: 
gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_dense_weighted_kernel.cu'],['../namespacenbit.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'nbit::float'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ae44f656615f2dcbbfec55dc3f365b9e3',1,'float: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a6df94b891e47f19e9fa76b529e49cdda',1,'float: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['float16_5fmax_25',['float16_max',['../namespacefbgemm__gpu.html#acb046dd929c4c4190894087e0952b6ad',1,'fbgemm_gpu']]], + ['float16_5fmin_26',['float16_min',['../namespacefbgemm__gpu.html#aab696723995ed599860851113bfdae05',1,'fbgemm_gpu']]], + ['float1_5fmax_27',['float1_max',['../namespacefbgemm__gpu.html#a245cd4874d44db0533c14f1e5da13b0d',1,'fbgemm_gpu']]], + ['float1_5fmin_28',['float1_min',['../namespacefbgemm__gpu.html#a3ec9af370f9f9997a31175d653701b82',1,'fbgemm_gpu']]], + ['float2_5fmax_29',['float2_max',['../namespacefbgemm__gpu.html#a75186b0bdaba58d01566eec48d2f6602',1,'fbgemm_gpu']]], + ['float2_5fmin_30',['float2_min',['../namespacefbgemm__gpu.html#aa0397156c968ae38da1e433bfd50d3a3',1,'fbgemm_gpu']]], + 
['float4_5fmax_31',['float4_max',['../namespacefbgemm__gpu.html#a7aaeb2b2ad68d85c51fb2b8697c70cc4',1,'fbgemm_gpu']]], + ['float4_5fmin_32',['float4_min',['../namespacefbgemm__gpu.html#adf07e886eabd113338425ed288c06a7b',1,'fbgemm_gpu']]], + ['float8_5fmax_33',['float8_max',['../namespacefbgemm__gpu.html#aa292f064d1126228ac0d10457722616c',1,'fbgemm_gpu']]], + ['float8_5fmin_34',['float8_min',['../namespacefbgemm__gpu.html#abca50cf5035e82d7992586eac7b744cf',1,'fbgemm_gpu']]], + ['float_5for_5fhalf_5fto_5ffused8bitrowwise_5fcpu_35',['float_or_half_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#ga06b7d2bf3fadaa9869555a64a6752ef7',1,'fbgemm_gpu']]], + ['float_5for_5fhalf_5fto_5ffusednbitrowwise_5fcpu_36',['float_or_half_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#ae983a889f16302029fcc4e5fcd5ce34f',1,'fbgemm_gpu']]], + ['float_5fto_5ffp8rowwise_5fcpu_37',['float_to_FP8rowwise_cpu',['../group__quantize-data-cpu.html#gad540dd7f8ad7601b3d9591114e4ef718',1,'fbgemm_gpu']]], + ['float_5fto_5ffused8bitrowwise_5fcpu_38',['float_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#gacf598456fd7aced63b96e8a725f4c418',1,'fbgemm_gpu']]], + ['float_5fto_5ffusednbitrowwise_5fcpu_39',['float_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a9330d767d66b257d1ffa28c67775b38e',1,'fbgemm_gpu']]], + ['float_5fto_5fhfp8_40',['float_to_hfp8',['../namespacefbgemm__gpu.html#a9710845f2dffae8b40b17d49c169976b',1,'fbgemm_gpu']]], + ['float_5fto_5fsto_5fhalf_5fassemblefloat_41',['float_to_sto_half_assemblefloat',['../verify__fp16__stochastic__benchmark_8cu.html#afb0f683c8db4e3b5d5fd504735c60b25',1,'verify_fp16_stochastic_benchmark.cu']]], + ['float_5fto_5fsto_5fhalf_5fbitcarry_42',['float_to_sto_half_bitcarry',['../verify__fp16__stochastic__benchmark_8cu.html#a0fa16f5c4aa1d84c03f25daeb10e9422',1,'verify_fp16_stochastic_benchmark.cu']]], + 
['float_5fto_5fsto_5fhalf_5fdirect_43',['float_to_sto_half_direct',['../verify__fp16__stochastic__benchmark_8cu.html#af0a4d95d246fb468f1b26eace73794f3',1,'verify_fp16_stochastic_benchmark.cu']]], + ['float_5fto_5fsto_5fhalf_5fshortrand_44',['float_to_sto_half_shortrand',['../verify__fp16__stochastic__benchmark_8cu.html#aecab575916373f334a644238b6e02cf2',1,'verify_fp16_stochastic_benchmark.cu']]], + ['floattobfloat16quantized_5fref_45',['FloatToBFloat16Quantized_ref',['../namespacefbgemm__gpu.html#a46f430eb3d28bcd3fed6fbc61dec3bda',1,'fbgemm_gpu']]], + ['floattofp8quantized_5fref_46',['FloatToFP8Quantized_ref',['../group__quantize-data-cpu.html#gad14f49d191c7960681206b7103d781c4',1,'fbgemm_gpu']]], + ['floattofp8rowwisequantized_5fmeta_47',['FloatToFP8RowwiseQuantized_meta',['../namespacefbgemm__gpu.html#a5a525ef518134e136f23ab964d45dc23',1,'fbgemm_gpu']]], + ['flush_48',['flush',['../classssd_1_1_embedding_rocks_d_b.html#adac116554b543b7c4228c018a85882f5',1,'ssd::EmbeddingRocksDB']]], + ['flush_5fcache_49',['flush_cache',['../verify__fp16__stochastic__benchmark_8cu.html#a65d8faf79602cb52dbf1c3dc90db0cbd',1,'flush_cache(std::vector< char > flush, char *d_flush, char *d_flush2, int cache_size, bool do_write=false): verify_fp16_stochastic_benchmark.cu'],['../bench__utils_8cuh.html#a7fcbe2b8cc9b7676bb24b328fd41dc3a',1,'flush_cache(int cache_size_mb=40, bool do_write=false): bench_utils.cuh']]], + ['flush_5fgpu_50',['flush_gpu',['../verify__fp16__stochastic__benchmark_8cu.html#ab211bd95de3d67a08c95c5d7f070dfcb',1,'verify_fp16_stochastic_benchmark.cu']]], + ['flush_5fif_5fnecessary_51',['flush_if_necessary',['../classssd_1_1_embedding_rocks_d_b.html#a5e5bb9c575c52445a77bd0c39afc50bb',1,'ssd::EmbeddingRocksDB']]], + ['fma_52',['fma',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ad5c1e8194ecc27d73fb5477bc6795df8',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP 
>::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#af82504393e0e09a157a40980598f626b',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ad3b821b9b1862e7970a798dcc105dce8',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a5779758db0a3dea1eb734fb1cbf9670d',1,'fbgemm_gpu::Vec4AccT::fma(const float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ad0817540a257625fecb7890a0ed2533c',1,'fbgemm_gpu::Vec4AccT::fma(const float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#acf03f270b01757bf3c12309e398fc663',1,'fbgemm_gpu::Vec4AccT::fma(const uint8_t *ptr, const float weight)'],['../embedding__forward__split__kernel__v2__template_8cu.html#a2a539cccb1f62bb145cef234b6608c7f',1,'fma(): embedding_forward_split_kernel_v2_template.cu']]], + ['fma_5f_53',['fma_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< float >::fma_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a5914148b281516a23c9786a11d6675ad',1,'fbgemm_gpu::Vec4T< at::Half >::fma_(const Vec4T< at::Half > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< at::Half >::fma_(const Vec4T< float > &a, const float 
b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a5914148b281516a23c9786a11d6675ad',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::fma_(const Vec4T< at::Half > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::fma_(const Vec4T< float > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a935586b35f2e7d90ec234784a8a5d2b8',1,'fbgemm_gpu::Vec4T< double >::fma_()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ad1ed20d954c2af00a7af0011bb652f42',1,'fbgemm_gpu::Vec4AccT::fma_(const float *vals, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#abe8fde8cd9a20ff924fd33e7d16eaa42',1,'fbgemm_gpu::Vec4AccT::fma_(const half *vals, const float weight)']]], + ['for_54',['for',['../namespacefbgemm__gpu.html#af2287d510f303567f2d28d743aa716b6',1,'fbgemm_gpu']]], + ['forward_55',['forward',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html#a278e4d6a68c0e694370831a0d04a2918',1,'fbgemm_gpu::PermutePooledEmbsFunction::forward()'],['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html#a83e4292464a5708945ca80a1f2171a27',1,'fbgemm_gpu::PermutePooledEmbsFunctionSplit::forward()']]], + ['fp_56',['FP',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a4ebada6a2af2bcba53ded1d7b414f081',1,'fbgemm_gpu']]], + ['fp16_57',['FP16',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaa4bf99d6945c25077fd6660d536af8a0',1,'fbgemm_gpu']]], + ['fp32_58',['FP32',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea693aa0bef84c25fe81c7e62e72f9313d',1,'fbgemm_gpu']]], + ['fp8_59',['FP8',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eae32efd813b88548940f8718a61864cf5',1,'fbgemm_gpu']]], + ['fp8quantizedtofloat_5fref_60',['FP8QuantizedToFloat_ref',['../group__quantize-data-cpu.html#ga4c49e527f364bfa224ed34f4fe9f13e7',1,'fbgemm_gpu']]], + 
['fp8rowwise_5fto_5ffloat_5fcpu_61',['FP8rowwise_to_float_cpu',['../group__quantize-data-cpu.html#ga1d3b2f7c37e8755516ff8a4c504017e1',1,'fbgemm_gpu']]], + ['fp8rowwise_5fto_5ffloat_5fmeta_62',['FP8rowwise_to_float_meta',['../namespacefbgemm__gpu.html#ae7fdacc8f9e0ec9e1ede8102876ab537',1,'fbgemm_gpu']]], + ['func_5fname_5f_63',['func_name_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a6bf2b75bf9dc4183f00948671a67e498',1,'fbgemm_gpu::TensorAccessorBase::func_name_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a1a08cc832507584680a6266e8c20c52f',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::func_name_']]], + ['fused8bitrowwise_5fto_5ffloat_5fcpu_64',['fused8bitrowwise_to_float_cpu',['../group__quantize-data-cpu.html#gab86a824fed15fab1c318359d069a5180',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5ffloat_5for_5fhalf_5fcpu_65',['fused8bitrowwise_to_float_or_half_cpu',['../group__quantize-data-cpu.html#gad219617d0aa308f97fad8dfc6af20213',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5fhalf_5fcpu_66',['fused8bitrowwise_to_half_cpu',['../group__quantize-data-cpu.html#ga9284d774f5d4087da98453e96e64d00a',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5fhalf_5fcpu_5fout_67',['fused8bitrowwise_to_half_cpu_out',['../namespacefbgemm__gpu.html#a389ed2b83ea0f408fe19fbb46770c610',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5ffloat_5fcpu_68',['fusednbitrowwise_to_float_cpu',['../group__quantize-data-cpu.html#ga61c494baf4e410652ed897534d14aa29',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5ffloat_5for_5fhalf_5fcpu_69',['fusednbitrowwise_to_float_or_half_cpu',['../group__quantize-data-cpu.html#ga5bd66d69876ef2493a6ebb4346c31bb9',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5fhalf_5fcpu_70',['fusednbitrowwise_to_half_cpu',['../group__quantize-data-cpu.html#ga1c32bf52a02928dbc573b4ac67065788',1,'fbgemm_gpu']]] ]; diff --git a/search/all_7.js b/search/all_7.js index e0908b210..1a1bf0ef6 100644 --- a/search/all_7.js +++ b/search/all_7.js @@ -1,6 +1,301 @@ var 
searchData= [ - ['input_20operators_0',['Combine Input Operators',['../group__input-combine.html',1,'']]], - ['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_1',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function',['../group__embedding-cuda.html#gabbe880100f1036a979f3a8d8755447d0',1,'embedding_forward_quantized_host.cpp']]], - ['is_5fuvm_5ftensor_2',['is_uvm_tensor',['../group__cumem-utils.html#ga0b9f28b07d3796a732b1fb73b8e10e7e',1,'fbgemm_gpu']]] + ['gauss_0',['gauss',['../structfbgemm__gpu_1_1rk__state.html#a84e948a0aa303456e29ddecfac6a1e46',1,'fbgemm_gpu::rk_state']]], + ['gen_5f8bit_5frandom_1',['gen_8bit_random',['../verify__fp16__stochastic__benchmark_8cu.html#aa292d49c7c13666d79ff4c646b5284f0',1,'verify_fp16_stochastic_benchmark.cu']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fcodegen_5fcuda_2ecu_2',['gen_batch_index_select_dim0_backward_codegen_cuda.cu',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fkernel_5fcta_2ecu_3',['gen_batch_index_select_dim0_backward_kernel_cta.cu',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fkernel_5fwarp_2ecu_4',['gen_batch_index_select_dim0_backward_kernel_warp.cu',['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fcodegen_5fcuda_2ecu_5',['gen_batch_index_select_dim0_forward_codegen_cuda.cu',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fkernel_2ecu_6',['gen_batch_index_select_dim0_forward_kernel.cu',['../gen__batch__index__select__dim0__forward__kernel_8cu.html',1,'']]], + 
['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fkernel_5fsmall_2ecu_7',['gen_batch_index_select_dim0_forward_kernel_small.cu',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html',1,'']]], + ['gen_5fdata_8',['gen_data',['../verify__fp16__stochastic__benchmark_8cu.html#ab5c51c16cea74c9decd6a2c957b515d9',1,'verify_fp16_stochastic_benchmark.cu']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fcpu_2ecpp_9',['gen_embedding_backward_adagrad_split_cpu.cpp',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_10',['gen_embedding_backward_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_11',['gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_12',['gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_13',['gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_14',['gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_15',['gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_16',['gen_embedding_backward_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_17',['gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_18',['gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fcuda_2ecu_19',['gen_embedding_backward_adam_split_unweighted_cuda.cu',['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fkernel_5fcta_2ecu_20',['gen_embedding_backward_adam_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_21',['gen_embedding_backward_adam_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_22',['gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_23',['gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_24',['gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fcuda_2ecu_25',['gen_embedding_backward_adam_split_weighted_cuda.cu',['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fkernel_5fcta_2ecu_26',['gen_embedding_backward_adam_split_weighted_kernel_cta.cu',['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_27',['gen_embedding_backward_adam_split_weighted_kernel_warp.cu',['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fcuda_2ecu_28',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fcta_2ecu_29',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_30',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_31',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_32',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_33',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fcuda_2ecu_34',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fcta_2ecu_35',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_36',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5findice_5fweights_5fcodegen_5fcuda_2ecu_37',['gen_embedding_backward_dense_indice_weights_codegen_cuda.cu',['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fcpu_2ecpp_38',['gen_embedding_backward_dense_split_cpu.cpp',['../gen__embedding__backward__dense__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fcuda_2ecu_39',['gen_embedding_backward_dense_split_unweighted_cuda.cu',['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fkernel_5fcta_2ecu_40',['gen_embedding_backward_dense_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_41',['gen_embedding_backward_dense_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_42',['gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_43',['gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_44',['gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fcuda_2ecu_45',['gen_embedding_backward_dense_split_weighted_cuda.cu',['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fkernel_5fcta_2ecu_46',['gen_embedding_backward_dense_split_weighted_kernel_cta.cu',['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_47',['gen_embedding_backward_dense_split_weighted_kernel_warp.cu',['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fcuda_2ecu_48',['gen_embedding_backward_lamb_split_unweighted_cuda.cu',['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fkernel_5fcta_2ecu_49',['gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_50',['gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_51',['gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_52',['gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_53',['gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fcuda_2ecu_54',['gen_embedding_backward_lamb_split_weighted_cuda.cu',['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fkernel_5fcta_2ecu_55',['gen_embedding_backward_lamb_split_weighted_kernel_cta.cu',['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_56',['gen_embedding_backward_lamb_split_weighted_kernel_warp.cu',['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fcuda_2ecu_57',['gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fkernel_5fcta_2ecu_58',['gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_59',['gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_60',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_61',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_62',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fcuda_2ecu_63',['gen_embedding_backward_lars_sgd_split_weighted_cuda.cu',['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fkernel_5fcta_2ecu_64',['gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_65',['gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fcuda_2ecu_66',['gen_embedding_backward_none_split_unweighted_cuda.cu',['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fkernel_5fcta_2ecu_67',['gen_embedding_backward_none_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_68',['gen_embedding_backward_none_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_69',['gen_embedding_backward_none_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_70',['gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_71',['gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fcuda_2ecu_72',['gen_embedding_backward_none_split_weighted_cuda.cu',['../gen__embedding__backward__none__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fkernel_5fcta_2ecu_73',['gen_embedding_backward_none_split_weighted_kernel_cta.cu',['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_74',['gen_embedding_backward_none_split_weighted_kernel_warp.cu',['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fcuda_2ecu_75',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fkernel_5fcta_2ecu_76',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_77',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_78',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_79',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_80',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fcuda_2ecu_81',['gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fkernel_5fcta_2ecu_82',['gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_83',['gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fcuda_2ecu_84',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fkernel_5fcta_2ecu_85',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_86',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_87',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_88',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_89',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fcuda_2ecu_90',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fkernel_5fcta_2ecu_91',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_92',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fcpu_2ecpp_93',['gen_embedding_backward_rowwise_adagrad_split_cpu.cpp',['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_94',['gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_95',['gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_96',['gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_97',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_98',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_99',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fcuda_2ecu_100',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fkernel_5fcta_2ecu_101',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fkernel_5fwarp_2ecu_102',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_103',['gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_104',['gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_105',['gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fcuda_2ecu_106',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fkernel_5fcta_2ecu_107',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fkernel_5fwarp_2ecu_108',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fcpu_2ecpp_109',['gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fcuda_2ecu_110',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fkernel_5fcta_2ecu_111',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_112',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_113',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_114',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_115',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fcuda_2ecu_116',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fkernel_5fcta_2ecu_117',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_118',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fcuda_2ecu_119',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fcta_2ecu_120',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_121',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_122',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_123',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_124',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fcuda_2ecu_125',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fcta_2ecu_126',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_127',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fcpu_2ecpp_128',['gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp',['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_129',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_130',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_131',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_132',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_133',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_134',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_135',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_136',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_137',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fcpu_2ecpp_138',['gen_embedding_backward_sgd_split_cpu.cpp',['../gen__embedding__backward__sgd__split__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fcuda_2ecu_139',['gen_embedding_backward_sgd_split_unweighted_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fkernel_5fcta_2ecu_140',['gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_141',['gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_142',['gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_143',['gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_144',['gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fcuda_2ecu_145',['gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fkernel_5fcta_2ecu_146',['gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fkernel_5fwarp_2ecu_147',['gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fcuda_2ecu_148',['gen_embedding_backward_sgd_split_weighted_cuda.cu',['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fkernel_5fcta_2ecu_149',['gen_embedding_backward_sgd_split_weighted_kernel_cta.cu',['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_150',['gen_embedding_backward_sgd_split_weighted_kernel_warp.cu',['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fcuda_2ecu_151',['gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fkernel_5fcta_2ecu_152',['gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fkernel_5fwarp_2ecu_153',['gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadagrad_2ecpp_154',['gen_embedding_backward_split_adagrad.cpp',['../gen__embedding__backward__split__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadagrad_5fcpu_2ecpp_155',['gen_embedding_backward_split_adagrad_cpu.cpp',['../gen__embedding__backward__split__adagrad__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fadam_2ecpp_156',['gen_embedding_backward_split_adam.cpp',['../gen__embedding__backward__split__adam_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadam_5fcpu_2ecpp_157',['gen_embedding_backward_split_adam_cpu.cpp',['../gen__embedding__backward__split__adam__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_2ecpp_158',['gen_embedding_backward_split_approx_rowwise_adagrad.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fcpu_2ecpp_159',['gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_2ecpp_160',['gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_2ecpp_161',['gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_2ecpp_162',['gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fcpu_2ecpp_163',['gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fapprox_5fsgd_2ecpp_164',['gen_embedding_backward_split_approx_sgd.cpp',['../gen__embedding__backward__split__approx__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5fsgd_5fcpu_2ecpp_165',['gen_embedding_backward_split_approx_sgd_cpu.cpp',['../gen__embedding__backward__split__approx__sgd__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fgrad_2ecu_166',['gen_embedding_backward_split_grad.cu',['../gen__embedding__backward__split__grad_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5findice_5fweights_5fcodegen_5fcuda_2ecu_167',['gen_embedding_backward_split_indice_weights_codegen_cuda.cu',['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flamb_2ecpp_168',['gen_embedding_backward_split_lamb.cpp',['../gen__embedding__backward__split__lamb_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flamb_5fcpu_2ecpp_169',['gen_embedding_backward_split_lamb_cpu.cpp',['../gen__embedding__backward__split__lamb__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flars_5fsgd_2ecpp_170',['gen_embedding_backward_split_lars_sgd.cpp',['../gen__embedding__backward__split__lars__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flars_5fsgd_5fcpu_2ecpp_171',['gen_embedding_backward_split_lars_sgd_cpu.cpp',['../gen__embedding__backward__split__lars__sgd__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fnone_2ecpp_172',['gen_embedding_backward_split_none.cpp',['../gen__embedding__backward__split__none_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fnone_5fcpu_2ecpp_173',['gen_embedding_backward_split_none_cpu.cpp',['../gen__embedding__backward__split__none__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5fadam_2ecpp_174',['gen_embedding_backward_split_partial_rowwise_adam.cpp',['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5fadam_5fcpu_2ecpp_175',['gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp',['../gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5flamb_2ecpp_176',['gen_embedding_backward_split_partial_rowwise_lamb.cpp',['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5flamb_5fcpu_2ecpp_177',['gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp',['../gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_2ecpp_178',['gen_embedding_backward_split_rowwise_adagrad.cpp',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fcpu_2ecpp_179',['gen_embedding_backward_split_rowwise_adagrad_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fcounter_2ecpp_180',['gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_2ecpp_181',['gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_2ecpp_182',['gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fcpu_2ecpp_183',['gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fweighted_5fadagrad_2ecpp_184',['gen_embedding_backward_split_rowwise_weighted_adagrad.cpp',['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fweighted_5fadagrad_5fcpu_2ecpp_185',['gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp',['../gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fsgd_2ecpp_186',['gen_embedding_backward_split_sgd.cpp',['../gen__embedding__backward__split__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fsgd_5fcpu_2ecpp_187',['gen_embedding_backward_split_sgd_cpu.cpp',['../gen__embedding__backward__split__sgd__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fcodegen_5fcuda_2ecu_188',['gen_embedding_forward_dense_unweighted_codegen_cuda.cu',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fcodegen_5fmeta_2ecpp_189',['gen_embedding_forward_dense_unweighted_codegen_meta.cpp',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fkernel_2ecu_190',['gen_embedding_forward_dense_unweighted_kernel.cu',['../gen__embedding__forward__dense__unweighted__kernel_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fdense_5funweighted_5fnobag_5fkernel_2ecu_191',['gen_embedding_forward_dense_unweighted_nobag_kernel.cu',['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fnobag_5fkernel_5fsmall_2ecu_192',['gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu',['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5fweighted_5fcodegen_5fcuda_2ecu_193',['gen_embedding_forward_dense_weighted_codegen_cuda.cu',['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5fweighted_5fcodegen_5fmeta_2ecpp_194',['gen_embedding_forward_dense_weighted_codegen_meta.cpp',['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5fweighted_5fkernel_2ecu_195',['gen_embedding_forward_dense_weighted_kernel.cu',['../gen__embedding__forward__dense__weighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5funweighted_5fcodegen_5fcuda_2ecu_196',['gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5funweighted_5fnobag_5fcodegen_5fcuda_2ecu_197',['gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5fweighted_5fcodegen_5fcuda_2ecu_198',['gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp16_5fcodegen_5fcuda_2ecu_199',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp32_5fcodegen_5fcuda_2ecu_200',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp8_5fcodegen_5fcuda_2ecu_201',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint2_5fcodegen_5fcuda_2ecu_202',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint4_5fcodegen_5fcuda_2ecu_203',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint8_5fcodegen_5fcuda_2ecu_204',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp16_5fcodegen_5fcuda_2ecu_205',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp32_5fcodegen_5fcuda_2ecu_206',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp8_5fcodegen_5fcuda_2ecu_207',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint2_5fcodegen_5fcuda_2ecu_208',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint4_5fcodegen_5fcuda_2ecu_209',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint8_5fcodegen_5fcuda_2ecu_210',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp16_5fcodegen_5fcuda_2ecu_211',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp32_5fcodegen_5fcuda_2ecu_212',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp8_5fcodegen_5fcuda_2ecu_213',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint2_5fcodegen_5fcuda_2ecu_214',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint4_5fcodegen_5fcuda_2ecu_215',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint8_5fcodegen_5fcuda_2ecu_216',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5funweighted_5fcodegen_5fcpu_2ecpp_217',['gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fweighted_5fcodegen_5fcpu_2ecpp_218',['gen_embedding_forward_quantized_weighted_codegen_cpu.cpp',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fcodegen_5fcuda_2ecu_219',['gen_embedding_forward_split_unweighted_codegen_cuda.cu',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fcodegen_5fmeta_2ecpp_220',['gen_embedding_forward_split_unweighted_codegen_meta.cpp',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fkernel_2ecu_221',['gen_embedding_forward_split_unweighted_kernel.cu',['../gen__embedding__forward__split__unweighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fnobag_5fkernel_2ecu_222',['gen_embedding_forward_split_unweighted_nobag_kernel.cu',['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fnobag_5fkernel_5fsmall_2ecu_223',['gen_embedding_forward_split_unweighted_nobag_kernel_small.cu',['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fv2_5fkernel_2ecu_224',['gen_embedding_forward_split_unweighted_v2_kernel.cu',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fcodegen_5fcuda_2ecu_225',['gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu',['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fcodegen_5fmeta_2ecpp_226',['gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp',['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fkernel_2ecu_227',['gen_embedding_forward_split_unweighted_vbe_kernel.cu',['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fcodegen_5fcuda_2ecu_228',['gen_embedding_forward_split_weighted_codegen_cuda.cu',['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fcodegen_5fmeta_2ecpp_229',['gen_embedding_forward_split_weighted_codegen_meta.cpp',['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fkernel_2ecu_230',['gen_embedding_forward_split_weighted_kernel.cu',['../gen__embedding__forward__split__weighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fv2_5fkernel_2ecu_231',['gen_embedding_forward_split_weighted_v2_kernel.cu',['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fcodegen_5fcuda_2ecu_232',['gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu',['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fcodegen_5fmeta_2ecpp_233',['gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp',['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fkernel_2ecu_234',['gen_embedding_forward_split_weighted_vbe_kernel.cu',['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_235',['gen_embedding_optimizer_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fadam_5fsplit_5fdevice_5fkernel_2ecuh_236',['gen_embedding_optimizer_adam_split_device_kernel.cuh',['../gen__embedding__optimizer__adam__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_237',['gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fdevice_5fkernel_2ecuh_238',['gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fdevice_5fkernel_2ecuh_239',['gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_240',['gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fdense_5fsplit_5fdevice_5fkernel_2ecuh_241',['gen_embedding_optimizer_dense_split_device_kernel.cuh',['../gen__embedding__optimizer__dense__split__device__kernel_8cuh.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5flamb_5fsplit_5fdevice_5fkernel_2ecuh_242',['gen_embedding_optimizer_lamb_split_device_kernel.cuh',['../gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5flars_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_243',['gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fnone_5fsplit_5fdevice_5fkernel_2ecuh_244',['gen_embedding_optimizer_none_split_device_kernel.cuh',['../gen__embedding__optimizer__none__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fpartial_5frowwise_5fadam_5fsplit_5fdevice_5fkernel_2ecuh_245',['gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh',['../gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fpartial_5frowwise_5flamb_5fsplit_5fdevice_5fkernel_2ecuh_246',['gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh',['../gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_2ecpp_247',['gen_embedding_optimizer_rowwise_adagrad_split.cpp',['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fcuda_2ecu_248',['gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu',['../gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_249',['gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fkernel_2ecu_250',['gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fdevice_5fkernel_2ecuh_251',['gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fdevice_5fkernel_2ecuh_252',['gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fweighted_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_253',['gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_254',['gen_embedding_optimizer_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html',1,'']]], + ['generate_5flxu_5fcache_5flocations_255',['generate_lxu_cache_locations',['../uvm__cache__miss__emulate__test_8cpp.html#ad12ee38ec43f8659ee8ce4f63f3857f4',1,'uvm_cache_miss_emulate_test.cpp']]], + ['generate_5frandom_5ftable_256',['generate_random_table',['../bench__utils_8cuh.html#a0899793cc86846edfa6ccefb7905f55c',1,'bench_utils.cuh']]], + ['generate_5fvbe_5fmetadata_257',['generate_vbe_metadata',['../split__embeddings__utils_8cuh.html#ae0dcbedd529d5873ad0cac75397cb1f8',1,'generate_vbe_metadata(const at::Tensor &B_offsets, const at::Tensor &B_offsets_rank_per_feature, const at::Tensor &output_offsets_feature_rank, const at::Tensor &D_offsets, const int64_t D, const bool 
nobag, const int64_t max_B_feature_rank, const int64_t info_B_num_bits, const int64_t total_B): generate_vbe_metadata.cu'],['../generate__vbe__metadata_8cu.html#a9c89bc26edc2d2f4014204d89bd846eb',1,'generate_vbe_metadata(const Tensor &B_offsets, const Tensor &B_offsets_rank_per_feature, const Tensor &output_offsets_feature_rank, const Tensor &D_offsets, const int64_t D, const bool nobag, const int64_t max_B_feature_rank, const int64_t info_B_num_bits, const int64_t total_B): generate_vbe_metadata.cu']]], + ['generate_5fvbe_5fmetadata_2ecu_258',['generate_vbe_metadata.cu',['../generate__vbe__metadata_8cu.html',1,'']]], + ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_259',['generic_histogram_binning_calibration_by_feature_cpu',['../group__sparse-data-cpu.html#gaef2a0a8c27e3b8b2d72be5c95ba7539e',1,'fbgemm_gpu']]], + ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcuda_260',['generic_histogram_binning_calibration_by_feature_cuda',['../namespacefbgemm__gpu.html#af9209d9d3ea127b5941dcab75bbfd39c',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessor_261',['GenericPackedTensorAccessor',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html',1,'GenericPackedTensorAccessor< T, N, PtrTraits, index_t >'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#a05cb3acde0a408e40526aad85584b274',1,'fbgemm_gpu::GenericPackedTensorAccessor::GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#aa8ff94c7184e151415673957258747e2',1,'fbgemm_gpu::GenericPackedTensorAccessor::GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const 
func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a05cb3acde0a408e40526aad85584b274',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#aa8ff94c7184e151415673957258747e2',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)']]], + ['genericpackedtensoraccessor_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_262',['GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessorbase_263',['GenericPackedTensorAccessorBase',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'GenericPackedTensorAccessorBase< T, N, PtrTraits, index_t >'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#ad3b41b3123d1d8bfc0e530b2323dde07',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::GenericPackedTensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#aab339f541ab3ce6195cabda68f736598',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::GenericPackedTensorAccessorBase(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)']]], + 
['genericpackedtensoraccessorbase_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_264',['GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessorbase_3c_20t_2c_20n_2c_20defaultptrtraits_2c_20int64_5ft_20_3e_265',['GenericPackedTensorAccessorBase< T, N, DefaultPtrTraits, int64_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['get_266',['get',['../classssd_1_1_embedding_rocks_d_b.html#a9a5671e5de645f247452456ffdfa81a9',1,'ssd::EmbeddingRocksDB']]], + ['get_5fcuda_267',['get_cuda',['../classssd_1_1_embedding_rocks_d_b.html#ac8082829ce873543f6388ddbd16362e8',1,'ssd::EmbeddingRocksDB']]], + ['get_5fd_5fbytes_268',['get_D_bytes',['../embedding__inplace__update__test_8cpp.html#a602d9bde988d40aaa1d846c76f8d87c7',1,'embedding_inplace_update_test.cpp']]], + ['get_5fdevice_5findex_5ffrom_5ftensor_269',['get_device_index_from_tensor',['../sparse__ops__utils_8h.html#a672c3da6666124b2950b2eef43587bc6',1,'get_device_index_from_tensor(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#af97638412af3aea185ac327ebe398542',1,'get_device_index_from_tensor(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['get_5fgroup_5findex_5fselect_5fcols_5fper_5fwarp_270',['get_group_index_select_cols_per_warp',['../namespacefbgemm__gpu.html#a4296f0fdcb9a3dcfdd67549340e8f38c',1,'fbgemm_gpu']]], + ['get_5finfos_5fmetadata_271',['get_infos_metadata',['../split__embeddings__utils_8cuh.html#a0994f8d37247e9754d069f16ee195c01',1,'get_infos_metadata(at::Tensor unused, int64_t B, int64_t T): get_infos_metadata.cu'],['../get__infos__metadata_8cu.html#a487bdb340f5c93165158a37aaf156fe9',1,'get_infos_metadata(Tensor unused, int64_t B, int64_t T): get_infos_metadata.cu']]], + ['get_5finfos_5fmetadata_2ecu_272',['get_infos_metadata.cu',['../get__infos__metadata_8cu.html',1,'']]], + 
['get_5fnext_5fbag_5fboundary_5fand_5fl_273',['get_next_bag_boundary_and_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): embedding_forward_split_kernel_v2_template.cu']]], + ['get_5fnvlink_5fmatrix_274',['get_nvlink_matrix',['../namespacefbgemm__gpu.html#ae554e4e9d8789449846323c52f840fe8',1,'fbgemm_gpu']]], + ['get_5funique_5findices_5fcuda_275',['get_unique_indices_cuda',['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu']]], + ['get_5fvalid_5fcpu_5ftensor_276',['get_valid_cpu_tensor',['../sparse__ops__utils__test_8cpp.html#a740d263ecb80b6e7cf28a86f561450b7',1,'sparse_ops_utils_test.cpp']]], + ['getpointer_277',['getPointer',['../structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html#ac04ebca5545952c6185a2693bc5d9fc9',1,'fbgemm_gpu::SharedMemory< int64_t >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html#a3472f2fcb0b65202627a7a5d0b47ab8f',1,'fbgemm_gpu::SharedMemory< int32_t 
>::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html#a11507d418a31c798c09f74aa6569fb72',1,'fbgemm_gpu::SharedMemory< float >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html#a53ef47c469305fb8b5427b2a0063db6f',1,'fbgemm_gpu::SharedMemory< double >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html#aa277fc58794548c1d2619afa9cd0be9e',1,'fbgemm_gpu::SharedMemory< Vec4T< at::acc_type< float, true > > >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html#a79e2902e4ab8379789578754af90253f',1,'fbgemm_gpu::SharedMemory< Vec4T< at::acc_type< double, true > > >::getPointer()']]], + ['getscalartype_278',['getScalarType',['../namespacefbgemm__gpu.html#ac7d6b4d86c0ce57c3af88ea03123fdb4',1,'fbgemm_gpu']]], + ['getsparsetype_279',['getSparseType',['../namespacefbgemm__gpu.html#a7dbc3a3bde83bfe7a18b720197f0f830',1,'fbgemm_gpu']]], + ['global_5fwarp_5fid_280',['global_warp_id',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a53d1bd761ca2346d5b9bcc60d1c43be6',1,'global_warp_id: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a53d1bd761ca2346d5b9bcc60d1c43be6',1,'global_warp_id: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['gpuatomicincrement_281',['gpuAtomicIncrement',['../embedding__backward__template__helpers_8cuh.html#aa054bfcfa5ed7f584d2811fe48a2f757',1,'embedding_backward_template_helpers.cuh']]], + ['grad_5fdev_5findices_282',['grad_dev_indices',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#af8b6c324711f37bf86e87d3d74f65c2e',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + 
['grad_5fdev_5fweights_283',['grad_dev_weights',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a18e7685a5fc29b232d08a33a75c44ca2',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['grad_5foutput_284',['grad_output',['../namespacefbgemm__gpu.html#a2a24c1ec3db68358edcac4561d38a0d1',1,'fbgemm_gpu']]], + ['grad_5fsum_285',['grad_sum',['../namespacefbgemm__gpu.html#ae1519b6699f9dca1080e9230f3d95245',1,'fbgemm_gpu']]], + ['grad_5fweight_286',['grad_weight',['../namespacefbgemm__gpu.html#a5b80925f60fbc21517ec3a2e137b78bd',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fcols_5fper_5fwarp_287',['GROUP_INDEX_SELECT_COLS_PER_WARP',['../namespacefbgemm__gpu.html#a0d76fd54f347327376ed8ba28ff66bfc',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_288',['group_index_select_dim0_gpu',['../namespacefbgemm__gpu.html#a33cd874aab109dc15436869064c3d689',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fbackward_5fmeta_289',['group_index_select_dim0_gpu_backward_meta',['../namespacefbgemm__gpu.html#a213539d8845a20efd90e93fed16f1090',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fimpl_290',['group_index_select_dim0_gpu_impl',['../namespacefbgemm__gpu.html#abda14dada6ae2b39b175ed52824dbfa5',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fimpl_5fmeta_291',['group_index_select_dim0_gpu_impl_meta',['../namespacefbgemm__gpu.html#a8d89670eae5b860788cb14175f01ce7e',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5funpack_292',['group_index_select_dim0_unpack',['../namespacefbgemm__gpu.html#ac4851777dc16c28c94a2cc9b58d3923c',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5flog_5fcols_5fper_5fwarp_293',['GROUP_INDEX_SELECT_LOG_COLS_PER_WARP',['../namespacefbgemm__gpu.html#a696ffb981f6c273f77aae0cf102b1f6b',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5for_5fadd_5fcuda_294',['group_index_select_or_add_cuda',['../namespacefbgemm__gpu.html#a394db33cacde2480607d48fe227274ef',1,'fbgemm_gpu']]], + 
['group_5findex_5fselect_5funroll_5ffactor_295',['GROUP_INDEX_SELECT_UNROLL_FACTOR',['../namespacefbgemm__gpu.html#a693bb0de52991f987fe81dc61c750403',1,'fbgemm_gpu']]], + ['group_5fsize_296',['group_size',['../namespacefbgemm__gpu.html#af0a2fbea18e37c564b3cada4172d96ff',1,'fbgemm_gpu']]], + ['gt_297',['gt',['../structfbgemm__gpu_1_1_comparator.html#a869e6734f5357dab7a63300629b414c8',1,'fbgemm_gpu::Comparator']]] ]; diff --git a/search/all_8.js b/search/all_8.js index bf0276894..8bd98e0c4 100644 --- a/search/all_8.js +++ b/search/all_8.js @@ -1,8 +1,23 @@ var searchData= [ - ['jagged_20tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], - ['jagged_20tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], - ['jagged_5fdense_5felementwise_5fadd_2',['jagged_dense_elementwise_add',['../group__jagged-tensor-ops-cpu.html#gaa797caaa08c70857433ae987d9cf30d7',1,'fbgemm_gpu']]], - ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_3',['jagged_dense_elementwise_add_jagged_output',['../group__jagged-tensor-ops-cpu.html#ga1290f40c3ba39837dd009c3006353d7c',1,'fbgemm_gpu']]], - ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fcuda_4',['jagged_dense_elementwise_add_jagged_output_cuda',['../group__jagged-tensor-ops-cuda.html#gad34ac20d2c9be5a6489c8e8befff7938',1,'fbgemm_gpu']]] + ['half4_0',['Half4',['../structfbgemm__gpu_1_1_half4.html',1,'fbgemm_gpu']]], + ['half4_1',['half4',['../jagged__tensor__ops_2common_8cuh.html#ac6142811afa7f90ec76eae1bc05da82b',1,'common.cuh']]], + ['half8_2',['half8',['../jagged__tensor__ops_2common_8cuh.html#a93d30ba34e45e42dfd6b2547b1652cb6',1,'common.cuh']]], + ['half_5fto_5ffused8bitrowwise_5fcpu_3',['half_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#gaa9daf4f3dc64238a5de8f82bbae656cf',1,'fbgemm_gpu']]], + 
['half_5fto_5ffusednbitrowwise_5fcpu_4',['half_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a545dc5567b0a08c31f65e2fc7ae21749',1,'fbgemm_gpu']]], + ['has_5fgauss_5',['has_gauss',['../structfbgemm__gpu_1_1rk__state.html#a629587b5f04293ea2b0bf452faa48344',1,'fbgemm_gpu::rk_state']]], + ['hex_6',['HEX',['../_c_make_c_compiler_id_8c.html#a46d5d95daa1bef867bd0179594310ed5',1,'HEX: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a46d5d95daa1bef867bd0179594310ed5',1,'HEX: CMakeCXXCompilerId.cpp']]], + ['hfma2_7',['hfma2',['../namespacefbgemm__gpu.html#a3ff3d0d7b40d8f2909fa6b35d64d250d',1,'fbgemm_gpu']]], + ['hfp8_5fto_5ffloat_8',['hfp8_to_float',['../namespacefbgemm__gpu.html#a1f35a2d3a2ede2e58e7986f8c2c757ec',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_9',['histogram_binning_calibration_by_feature_cpu',['../namespacefbgemm__gpu.html#a499764d7156d294219e3ae2629ae229f',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fby_5ffeature_5fcuda_10',['histogram_binning_calibration_by_feature_cuda',['../namespacefbgemm__gpu.html#ac639ce2e71982d5d1da0a30c92858aa8',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fcpu_11',['histogram_binning_calibration_cpu',['../group__sparse-data-cpu.html#ga201bb2241fc9d582d6c0fe968b0e71ca',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fcuda_12',['histogram_binning_calibration_cuda',['../namespacefbgemm__gpu.html#a1b19059704ba1911efbedf4adcbb0ee3',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fops_2ecu_13',['histogram_binning_calibration_ops.cu',['../histogram__binning__calibration__ops_8cu.html',1,'']]], + ['hmul_14',['hmul',['../namespacefbgemm__gpu.html#ab50e28187eb7fdf5b8cd74cd8150b025',1,'fbgemm_gpu']]], + ['hmul_5fshort2_15',['hmul_short2',['../namespacefbgemm__gpu.html#a257181e3db25da8e4d1b4ef73976271d',1,'fbgemm_gpu']]], + 
['host_16',['HOST',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194ab9361011891280a44d85b967739cc6a5',1,'fbgemm_gpu']]], + ['host_5flxu_5fcache_5fslot_17',['host_lxu_cache_slot',['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu']]], + ['hostasynchronousthreadpoolexecutor_18',['hostAsynchronousThreadPoolExecutor',['../namespacessd.html#ac14b5cc833767dd1941b5c2de7153299',1,'ssd']]], + ['hypercompressedsparsecolumn_19',['HyperCompressedSparseColumn',['../structinternal_1_1_hyper_compressed_sparse_column.html',1,'internal']]] ]; diff --git a/search/all_9.js b/search/all_9.js index 8a8780d6c..842aecf05 100644 --- a/search/all_9.js +++ b/search/all_9.js @@ -1,16 +1,74 @@ var searchData= [ - ['layout_20transformation_20cpu_20operators_0',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], - ['layout_20transformation_20cuda_20operators_1',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]], - ['lfu_5fcache_5fpopulate_5fbyte_5fcuda_2',['lfu_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu'],['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor 
weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu']]], - ['lfu_5fcache_5fpopulate_5fcuda_3',['lfu_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu'],['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu']]], - ['linearize_5fcache_5findices_5fcuda_4',['linearize_cache_indices_cuda',['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets): linearize_cache_indices.cu']]], - ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcuda_5',['linearize_cache_indices_from_row_idx_cuda',['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(Tensor cache_hash_size_cumsum, Tensor update_table_indices, Tensor update_row_indices): 
linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices): linearize_cache_indices.cu']]], - ['lru_5fcache_5ffind_5funcached_5fcuda_6',['lru_cache_find_uncached_cuda',['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter): lru_cache_find.cu'],['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(Tensor unique_indices, Tensor unique_indices_length, int64_t max_indices, Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, bool gather_cache_stats, Tensor uvm_cache_stats, bool lock_cache_line, Tensor lxu_cache_locking_counter): lru_cache_find.cu']]], - ['lru_5fcache_5fpopulate_5fbyte_5fcuda_7',['lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga5958e4cecc978d415714a3dd691fbc11',1,'split_embeddings_cache_cuda.cuh']]], - ['lru_5fcache_5fpopulate_5fcuda_8',['lru_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga00d12767ad238d73598bf7dc4d1afa06',1,'split_embeddings_cache_cuda.cuh']]], - ['lxu_5fcache_5fflush_5fcuda_9',['lxu_cache_flush_cuda',['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding): 
lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(Tensor uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor lxu_cache_state, Tensor lxu_cache_weights, bool stochastic_rounding): lxu_cache.cu']]], - ['lxu_5fcache_5flocations_5fupdate_5fcuda_10',['lxu_cache_locations_update_cuda',['../group__table-batched-embed-cuda.html#ga65cba33a439fb1ed50fe2e80dc22b603',1,'split_embeddings_cache_cuda.cuh']]], - ['lxu_5fcache_5flocking_5fcounter_5fdecrement_5fcuda_11',['lxu_cache_locking_counter_decrement_cuda',['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu']]], - ['lxu_5fcache_5flookup_5fcuda_12',['lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#ga124b70b0fede88f508e59111ce6d765f',1,'split_embeddings_cache_cuda.cuh']]] + ['idx_0',['idx',['../namespacefbgemm__gpu.html#a9d7e9481c420588a334b2aedac0f5af4',1,'fbgemm_gpu']]], + ['if_1',['if',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a618af795eb1829b78b342e084130e1f4',1,'if(t >=T): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a37c3fe73e60868097d45b151e9c4a430',1,'if(is_zero_total_L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a49e7c05f68f0175f3c44c6b1c12c5117',1,'if(is_small_L &&table_warp_id >=num_warps_for_small_L *8): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1958ec7365ff8575f7973e15353c0121',1,'if(threadIdx.x==0): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a509435224d0201170dbceeef2d47698f',1,'if(table_warp_id >=num_warps_per_row *(is_small_L ? num_warps_for_small_L :B)): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a4ab8250d245b6612c02d934b63fdcd52',1,'if(is_small_L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aaa19ed116a2acf1b1ef0527b77b3d4ec',1,'if(L<=1): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af6822b01edff1e16c53f21b0c6142ffd',1,'if(load_D - load_d< kWarpSize): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a618af795eb1829b78b342e084130e1f4',1,'if(t >=T): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a37c3fe73e60868097d45b151e9c4a430',1,'if(is_zero_total_L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a49e7c05f68f0175f3c44c6b1c12c5117',1,'if(is_small_L &&table_warp_id >=num_warps_for_small_L *8): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1958ec7365ff8575f7973e15353c0121',1,'if(threadIdx.x==0): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a509435224d0201170dbceeef2d47698f',1,'if(table_warp_id >=num_warps_per_row *(is_small_L ? 
num_warps_for_small_L :B)): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a4ab8250d245b6612c02d934b63fdcd52',1,'if(is_small_L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aaa19ed116a2acf1b1ef0527b77b3d4ec',1,'if(L<=1): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../bench__utils_8cuh.html#aa3487d3e764e516ac71de417077959a6',1,'if(do_write *val): bench_utils.cuh'],['../namespacefbgemm__gpu.html#a29ef435892df0dc6cd3fa9769486e659',1,'fbgemm_gpu::if(index >=num_logits)'],['../namespacefbgemm__gpu.html#a32dace4feb1fa305053fd440163ba422',1,'fbgemm_gpu::if(curr_bin_num_examples > bin_ctr_in_use_after)'],['../namespacefbgemm__gpu.html#a4b4f7604af9accc2a43a8e060b6145e7',1,'fbgemm_gpu::if(index >=num_lengths - 1)'],['../namespacefbgemm__gpu.html#ac2276128422f0c744cc68659b731d53a',1,'fbgemm_gpu::if(next_offset==curr_offset+1)'],['../namespacefbgemm__gpu.html#a1d72e092775be40f6a57865b410d55e9',1,'fbgemm_gpu::if(list_id >=num_lists)'],['../namespacefbgemm__gpu.html#aa41e0708c4b465d4a89e0c1de6a60dd1',1,'fbgemm_gpu::if(per_sample_weights_addrs)'],['../namespacefbgemm__gpu.html#a6080a87e4588877fbbdd8a03d16d927d',1,'fbgemm_gpu::if(b >=B)'],['../namespacefbgemm__gpu.html#a9e204163946d36c19beef5443a1b71b6',1,'fbgemm_gpu::if(n >=N)'],['../namespacefbgemm__gpu.html#aa6453091b8359fcc2da599396bb27f52',1,'fbgemm_gpu::if(run_id >=sorted_linear_indices_run.size(0))'],['../namespacefbgemm__gpu.html#ad0904756703f278e8c03d0be1918211b',1,'fbgemm_gpu::if(run_id >=sorted_linear_indices_num_runs[0])'],['../namespacefbgemm__gpu.html#aaf49df4f26b7eff1308265a096c0c768',1,'fbgemm_gpu::if(SL==0)'],['../namespacefbgemm__gpu.html#a426625b7d5c06c4059e34784c1fdd74f',1,'fbgemm_gpu::if(t >=T||b >=batch_size_per_feature[t])'],['../namespacefbgemm__gpu.html#ae198c10fa781aa859c0e8666fc10063b',1,'fbgemm_gpu::if(i 
>=input_size)'],['../namespacefbgemm__gpu.html#a1958ec7365ff8575f7973e15353c0121',1,'fbgemm_gpu::if(threadIdx.x==0)']]], + ['ignore_2',['IGNORE',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaaa2e843feab94ef623fea888f07c28696',1,'fbgemm_gpu']]], + ['inclusive_5fsum_5fscan_5fkernel_3',['inclusive_sum_scan_kernel',['../namespacefbgemm__gpu.html#ae86238f4ca864fb4ea41318ece747ab4',1,'fbgemm_gpu']]], + ['index_5fadd_4',['index_add',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_add()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_add()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_add()']]], + ['index_5fadd_5fwith_5funique_5findices_5fcuda_5',['index_add_with_unique_indices_cuda',['../namespacefbgemm__gpu.html#a80e08c6c5c1ebf2b34c6490eee0e8415',1,'fbgemm_gpu']]], + ['index_5ffma_6',['index_fma',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_fma()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_fma()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_fma()']]], + ['index_5fselect_5fcuda_7',['index_select_cuda',['../namespacefbgemm__gpu.html#a543ba161110516ef84a9fbeb83c7af5c',1,'fbgemm_gpu']]], + ['index_5fselect_5fdim0_5fgpu_8',['index_select_dim0_gpu',['../namespacefbgemm__gpu.html#a170ff30798a3bcf42cc3f0669f938450',1,'fbgemm_gpu']]], + 
['index_5fselect_5fscalar_5fcumsum_5fkernel_9',['index_select_scalar_cumsum_kernel',['../namespacefbgemm__gpu.html#aa762379def70fcfe1f15ff2a347af4a9',1,'fbgemm_gpu']]], + ['index_5fstore_10',['index_store',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, uint8_t *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, uint8_t *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, uint8_t *ptr)']]], + 
['index_5fweighted_5fstore_11',['index_weighted_store',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t 
>::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)']]], + ['index_5fweights_12',['index_weights',['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a79f22b62b5882d0d141e2797331c3262',1,'index_weights: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a79f22b62b5882d0d141e2797331c3262',1,'index_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a79f22b62b5882d0d141e2797331c3262',1,'index_weights: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['indices_13',['indices',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#acbf20500022fb5f972956bea423a05ff',1,'indices: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a983b46d1ccd1b8d7ee0f786801acdabf',1,'indices: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a2d7f9971f231260d0da708ce6bf6c179',1,'indices: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#ac6808dbd8c1563373cd2bf230c07e283',1,'indices: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a6df12c527b79f006699968f24d774fcb',1,'indices: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a2d7f9971f231260d0da708ce6bf6c179',1,'indices: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a6df12c527b79f006699968f24d774fcb',1,'indices: 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6df12c527b79f006699968f24d774fcb',1,'indices: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a6df12c527b79f006699968f24d774fcb',1,'indices: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../namespacefbgemm__gpu.html#aabefe307b5a16f2e2d2c5cc6c74719b6',1,'fbgemm_gpu::indices']]], + ['indices_5faddrs_14',['indices_addrs',['../namespacefbgemm__gpu.html#a192b4d5303123cf4b57b1491cd42e36e',1,'fbgemm_gpu']]], + ['indices_5fdata_15',['indices_data',['../namespacefbgemm__gpu.html#acb7eb1c50758e407a638a81723961f56',1,'fbgemm_gpu']]], + ['indices_5fend_16',['indices_end',['../namespacefbgemm__gpu.html#ac7c7ecdd5162f325b65a6b5c5c6c40ca',1,'fbgemm_gpu']]], + ['indices_5fis_5flong_17',['indices_is_long',['../namespacefbgemm__gpu.html#a3d08a36103c24a910afe1dbfa89e3060',1,'fbgemm_gpu']]], + ['indices_5foffsets_18',['indices_offsets',['../namespacefbgemm__gpu.html#af03fdab0a39bf13b8ec4de336253b8aa',1,'fbgemm_gpu']]], + ['indices_5fptrs_19',['indices_ptrs',['../namespacefbgemm__gpu.html#a7e26138f974174b1cd94f35321fef17d',1,'fbgemm_gpu']]], + ['indices_5fstart_20',['indices_start',['../namespacefbgemm__gpu.html#a43255cb54bbd791afb26a23af02acfec',1,'fbgemm_gpu']]], + ['indices_5fto_5flb_21',['indices_to_lb',['../namespacefbgemm__gpu.html#af069d2baffbfbe0b8aae6aea56d31e86',1,'fbgemm_gpu']]], + ['info_22',['info',['../namespacefbgemm__gpu.html#aa494944475a226c613cdd03931ba061d',1,'fbgemm_gpu']]], + ['info_5farch_23',['info_arch',['../_c_make_c_compiler_id_8c.html#a59647e99d304ed33b15cb284c27ed391',1,'info_arch: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a59647e99d304ed33b15cb284c27ed391',1,'info_arch: CMakeCXXCompilerId.cpp']]], + ['info_5fb_5fmask_24',['info_B_mask',['../namespacefbgemm__gpu.html#acdf5304fcbfbc6f85054b8c45407691f',1,'fbgemm_gpu']]], + 
['info_5fb_5fnum_5fbits_25',['info_B_num_bits',['../namespacefbgemm__gpu.html#a4558e86e39e5639ec4665246b76df453',1,'fbgemm_gpu']]], + ['info_5fcompiler_26',['info_compiler',['../_c_make_c_compiler_id_8c.html#a4b0efeb7a5d59313986b3a0390f050f6',1,'info_compiler: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a4b0efeb7a5d59313986b3a0390f050f6',1,'info_compiler: CMakeCXXCompilerId.cpp']]], + ['info_5flanguage_5fextensions_5fdefault_27',['info_language_extensions_default',['../_c_make_c_compiler_id_8c.html#a0f46a8a39e09d9b803c4766904fd7e99',1,'info_language_extensions_default: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a0f46a8a39e09d9b803c4766904fd7e99',1,'info_language_extensions_default: CMakeCXXCompilerId.cpp']]], + ['info_5flanguage_5fstandard_5fdefault_28',['info_language_standard_default',['../_c_make_c_compiler_id_8c.html#a4607cccf070750927b458473ca82c090',1,'info_language_standard_default: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a4607cccf070750927b458473ca82c090',1,'info_language_standard_default: CMakeCXXCompilerId.cpp']]], + ['info_5fplatform_29',['info_platform',['../_c_make_c_compiler_id_8c.html#a2321403dee54ee23f0c2fa849c60f7d4',1,'info_platform: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a2321403dee54ee23f0c2fa849c60f7d4',1,'info_platform: CMakeCXXCompilerId.cpp']]], + ['initializer_30',['Initializer',['../classssd_1_1_initializer.html',1,'Initializer'],['../classssd_1_1_initializer.html#af5e246dd12f1a6c4e06ab77a41bd0590',1,'ssd::Initializer::Initializer()']]], + ['input_20operators_31',['Combine Input Operators',['../group__input-combine.html',1,'']]], + ['input_5fcombine_2ecu_32',['input_combine.cu',['../input__combine_8cu.html',1,'']]], + ['input_5fcombine_2eh_33',['input_combine.h',['../input__combine_8h.html',1,'']]], + ['input_5fcombine_5fcpu_2ecpp_34',['input_combine_cpu.cpp',['../input__combine__cpu_8cpp.html',1,'']]], + 
['input_5fcombine_5fgpu_2ecpp_35',['input_combine_gpu.cpp',['../input__combine__gpu_8cpp.html',1,'']]], + ['input_5foffsets_36',['input_offsets',['../namespacefbgemm__gpu.html#a88aea1b3f2194509bb8bb7105e0d6553',1,'fbgemm_gpu']]], + ['input_5fsize_37',['input_size',['../namespacefbgemm__gpu.html#a5549affa3c112bf0c71b0e2323eb0c14',1,'fbgemm_gpu']]], + ['instantiate_5fbatched_5fcsr2csc_38',['INSTANTIATE_BATCHED_CSR2CSC',['../embedding__forward__split__cpu_8cpp.html#a32da455953694aac0b5e837bd3f1c31a',1,'embedding_forward_split_cpu.cpp']]], + ['int_39',['INT',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a53f93baa3057821107c750323892fa92',1,'fbgemm_gpu']]], + ['int2_40',['INT2',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea8fbf1fab49398b0d298699ea3ccbebc5',1,'fbgemm_gpu']]], + ['int32_5ft_41',['int32_t',['../namespacefbgemm__gpu.html#a112ef14feafbe22a3b70fd5ddcefcf99',1,'fbgemm_gpu']]], + ['int4_42',['INT4',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea94635600f8a63640263a5ebc30d79a2a',1,'fbgemm_gpu']]], + ['int64_5ft_43',['int64_t',['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__backward__split__grad_8cu.html#af261ebff9d4ab236e8dd6bea30db7fb1',1,'int64_t: gen_embedding_backward_split_grad.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: 
gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ac4ebc0de2e60165af8333b6f4eab3e70',1,'int64_t: gen_embedding_forward_split_weighted_vbe_kernel.cu']]], + ['int8_44',['INT8',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaee9d73311ff0658494edfff14c3ec1e3',1,'fbgemm_gpu']]], + 
['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5funweighted_5fcpu_45',['int_nbit_split_embedding_codegen_forward_unweighted_cpu',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html#a718e1ac4e0fa56a96e666ee2d5a5c40a',1,'int_nbit_split_embedding_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#a718e1ac4e0fa56a96e666ee2d5a5c40a',1,'int_nbit_split_embedding_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_46',['int_nbit_split_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#a9c3b5fb374c1ef95520bc4e30b66325e',1,'int_nbit_split_embedding_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, const int64_t total_D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t pooling_mode, const int64_t row_alignment, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, 
const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a5a581a6131f9754699b4e5bb27b20ecb',1,'int_nbit_split_embedding_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5fweighted_5fcpu_47',['int_nbit_split_embedding_codegen_forward_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#a5a1cc170a745f03faefac536cfcbf1e6',1,'int_nbit_split_embedding_codegen_forward_weighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_weighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#a5a1cc170a745f03faefac536cfcbf1e6',1,'int_nbit_split_embedding_codegen_forward_weighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t 
fp8_exponent_bias): gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_48',['int_nbit_split_embedding_codegen_forward_weighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae65cbb34f3d373fe3e12b7bb899c1b10',1,'int_nbit_split_embedding_codegen_forward_weighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, const int64_t total_D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t pooling_mode, const int64_t row_alignment, Tensor indice_weights, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a79655cba701e82021eefe7fe8cb72916',1,'int_nbit_split_embedding_codegen_forward_weighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu']]], + 
['int_5fnbit_5fsplit_5fembedding_5fcodegen_5flookup_5ffunction_49',['int_nbit_split_embedding_codegen_lookup_function',['../group__embedding-cuda.html#ga0749f1c6540189dd47b32a56858f82fb',1,'embedding_forward_quantized_host.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5flookup_5ffunction_5fcpu_50',['int_nbit_split_embedding_codegen_lookup_function_cpu',['../group__embedding-cpu.html#gac115303550aa9af7c170baef63bcdb00',1,'embedding_forward_quantized_host_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcpu_51',['int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html#ab6ae7551f9cd9d5cdb845240887aeaa1',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, const int64_t D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#af3d9ee6fd394ec0055de7f2c2acfba3d',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, int64_t D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + 
['int_5fnbit_5fsplit_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_52',['int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#a9b168b9b2d002f86f7f16211b83fced0',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, const int64_t D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t row_alignment, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a0545cdf708e09c0958f1538e7b4b29c9',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, int64_t D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t row_alignment, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu']]], + ['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_53',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function',['../group__embedding-cuda.html#gabbe880100f1036a979f3a8d8755447d0',1,'embedding_forward_quantized_host.cpp']]], + 
['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_5fcpu_54',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu',['../group__embedding-cpu.html#gaf5c83f0c03200546398764261403749d',1,'embedding_forward_quantized_host_cpu.cpp']]], + ['internal_55',['internal',['../namespaceinternal.html',1,'']]], + ['invalid_56',['INVALID',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaccc0377a8afbf50e7094f5c23a8af223',1,'fbgemm_gpu']]], + ['invert_5fpermute_5fcpu_57',['invert_permute_cpu',['../namespacefbgemm__gpu.html#aa79c3b125ba955f02e8ee2e70b1bbd32',1,'fbgemm_gpu']]], + ['invoke_5fgroup_5findex_5fselect_5for_5fadd_58',['INVOKE_GROUP_INDEX_SELECT_OR_ADD',['../sparse__group__index_8cu.html#acc7197a16e3ef386f0fd807a0919110b',1,'sparse_group_index.cu']]], + ['invoke_5fkernel_5fwith_5fdim_59',['INVOKE_KERNEL_WITH_DIM',['../jagged__tensor__ops_2common_8cuh.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: common.cuh'],['../jagged__tensor__ops_2common_8cuh.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: common.cuh'],['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_dense_dense_elementwise_add_jagged_output_forward.cu'],['../jagged__dense__elementwise__mul__backward_8cu.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_dense_elementwise_mul_backward.cu'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp']]], + 
['invoke_5flinearize_5findex_5fkernel_60',['INVOKE_LINEARIZE_INDEX_KERNEL',['../transpose__embedding__input_8cu.html#ac03452638c5653f404a402f9f7356841',1,'transpose_embedding_input.cu']]], + ['invoke_5fprocess_5fall_5findices_61',['INVOKE_PROCESS_ALL_INDICES',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: embedding_forward_split_kernel_v2_template.cu']]], + ['invoke_5fprocess_5fall_5findices_5fhelper_62',['INVOKE_PROCESS_ALL_INDICES_HELPER',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: embedding_forward_split_kernel_v2_template.cu']]], + ['is_5faligned_63',['is_aligned',['../namespacefbgemm__gpu.html#ae24b9318a63a9532f426abc0b0e94819',1,'fbgemm_gpu']]], + ['is_5flong_5fidx_64',['is_long_idx',['../namespacefbgemm__gpu.html#a96187c00fa81aaf4d6404cc915a5d7b7',1,'fbgemm_gpu']]], + ['is_5flong_5fmask_65',['is_long_mask',['../namespacefbgemm__gpu.html#ace5ac8a87afdca35747d5c9bd8e33e73',1,'fbgemm_gpu']]], + ['is_5flong_5fnum_5fbits_66',['IS_LONG_NUM_BITS',['../namespacefbgemm__gpu.html#ab9c0e24618d9ec723a7fcc8653c0dd59',1,'fbgemm_gpu']]], + 
['is_5fsmall_5fl_67',['is_small_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a58d2a94da907a301d9cd71dffefa25c3',1,'is_small_L: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a58d2a94da907a301d9cd71dffefa25c3',1,'is_small_L: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['is_5fuvm_5ftensor_68',['is_uvm_tensor',['../group__cumem-utils.html#gacba28ed334d071e79c1ead1792391e9d',1,'fbgemm_gpu']]], + ['is_5fzero_5ftotal_5fl_69',['is_zero_total_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ac5d11523cb9e630706dead6e236d9385',1,'is_zero_total_L: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ac5d11523cb9e630706dead6e236d9385',1,'is_zero_total_L: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['items_5f_70',['items_',['../classfbgemm__gpu_1_1enum__registration.html#addb614cfff2cdb5220c587cbfd7b08fb',1,'fbgemm_gpu::enum_registration']]] ]; diff --git a/search/all_a.js b/search/all_a.js index 4ddf8079f..ebce08b6b 100644 --- a/search/all_a.js +++ b/search/all_a.js @@ -1,5 +1,89 @@ var searchData= [ - ['memorty_20operators_0',['CUDA Memorty Operators',['../group__cumem-utils.html',1,'']]], - ['merge_20operators_1',['Merge Operators',['../group__merge-pooled-emb.html',1,'']]] + ['jagged_20tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], + ['jagged_20tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], + ['jagged_5f1d_5fto_5fdense_2',['jagged_1d_to_dense',['../group__jagged-tensor-ops-cpu.html#ga93b5edf03f38d8eaf9a0f1ece0bc1af7',1,'fbgemm_gpu']]], + ['jagged_5f1d_5fto_5fdense_5fmeta_3',['jagged_1d_to_dense_meta',['../namespacefbgemm__gpu.html#afdde1bd5a99cc5bcdfaf27b4c42cad7b',1,'fbgemm_gpu']]], + 
['jagged_5f2d_5fto_5fdense_4',['jagged_2d_to_dense',['../group__jagged-tensor-ops-cpu.html#gaaa301b81a22a3d823ba5e65828093113',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fforward_5fcpu_5',['jagged_2d_to_dense_forward_cpu',['../namespacefbgemm__gpu.html#a70d2cdc82d96c9c4298b57133393a800',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fgpu_5fbackward_6',['jagged_2d_to_dense_gpu_backward',['../namespacefbgemm__gpu.html#a7c104248a9abcdcdac6bdcac571930a4',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fgpu_5fforward_7',['jagged_2d_to_dense_gpu_forward',['../namespacefbgemm__gpu.html#a56c28427858ea272148bdbfb9f373191',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fmeta_8',['jagged_2d_to_dense_meta',['../namespacefbgemm__gpu.html#a67b19e389f869540bd35510d4e8e7908',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_9',['jagged_dense_bmm',['../namespacefbgemm__gpu.html#aed181c3885f392fec8c38cdf10266d68',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_10',['jagged_dense_bmm_forward',['../namespacefbgemm__gpu.html#a3eec1622180be9b7a31891d5e9f2ba71',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_2ecu_11',['jagged_dense_bmm_forward.cu',['../jagged__dense__bmm__forward_8cu.html',1,'']]], + ['jagged_5fdense_5fbmm_5fforward_5fcuda_12',['jagged_dense_bmm_forward_cuda',['../namespacefbgemm__gpu.html#a4961acd2615018dff4fdf1390158f0a4',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_5fmeta_13',['jagged_dense_bmm_forward_meta',['../namespacefbgemm__gpu.html#a022cdaaee01f619cf0cb7b29d80cbc65',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fkernel_14',['jagged_dense_bmm_kernel',['../namespacefbgemm__gpu.html#a6c32f4b4ccfdef9cf63d463cb235ec38',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_15',['jagged_dense_dense_elementwise_add_jagged_output',['../namespacefbgemm__gpu.html#a47e4d714a08316066470d979f97f1d81',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_16',['jagged_dense_dense_elementwise_add_jagged_output_forward',['../namespacefbgemm__gpu.html#a10611541bdce9c65bfe48a01474d1725',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_2ecu_17',['jagged_dense_dense_elementwise_add_jagged_output_forward.cu',['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html',1,'']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_5fmeta_18',['jagged_dense_dense_elementwise_add_jagged_output_forward_meta',['../namespacefbgemm__gpu.html#a56cac54ea3d7672c629010018ba59568',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fmeta_19',['jagged_dense_dense_elementwise_add_jagged_output_meta',['../namespacefbgemm__gpu.html#ab421ce372347f826b7e7ff9e35f26c93',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5f_20',['jagged_dense_dense_elementwise_jagged_output_',['../namespacefbgemm__gpu.html#a319b3f5f33bec0aff79f0ee990483f3d',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5fmatches_5fopt_21',['jagged_dense_dense_elementwise_jagged_output_matches_opt',['../namespacefbgemm__gpu.html#adfb04060c9eecdadcf59b3c15d5bca08',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5fopt_5f_22',['jagged_dense_dense_elementwise_jagged_output_opt_',['../namespacefbgemm__gpu.html#aac40d60c62b0d176a962cdad964e34f6',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_23',['jagged_dense_elementwise_add',['../group__jagged-tensor-ops-cpu.html#gaa797caaa08c70857433ae987d9cf30d7',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_24',['jagged_dense_elementwise_add_jagged_output',['../group__jagged-tensor-ops-cpu.html#ga1290f40c3ba39837dd009c3006353d7c',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fcuda_25',['jagged_dense_elementwise_add_jagged_output_cuda',['../group__jagged-tensor-ops-cuda.html#gad34ac20d2c9be5a6489c8e8befff7938',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fmeta_26',['jagged_dense_elementwise_add_jagged_output_meta',['../namespacefbgemm__gpu.html#a16d84a11c2e32cb0064721354fb190b7',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fmeta_27',['jagged_dense_elementwise_add_meta',['../namespacefbgemm__gpu.html#aff88b44d096bd7a039dca72a5855198c',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fjagged_5foutput_5f_28',['jagged_dense_elementwise_jagged_output_',['../namespacefbgemm__gpu.html#a124d128a82ffb0342ce597d0325060fb',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fjagged_5foutput_5fopt_5f_29',['jagged_dense_elementwise_jagged_output_opt_',['../namespacefbgemm__gpu.html#aded7d8ce8ffbcce568c498fb32a7d071',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_30',['jagged_dense_elementwise_mul',['../group__jagged-tensor-ops-cpu.html#ga5521ad46f5bab0d77c8bb036742f455d',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fbackward_31',['jagged_dense_elementwise_mul_backward',['../namespacefbgemm__gpu.html#a6de8f2f64f7d90ab1997df02470a9564',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fbackward_2ecu_32',['jagged_dense_elementwise_mul_backward.cu',['../jagged__dense__elementwise__mul__backward_8cu.html',1,'']]], + ['jagged_5fdense_5felementwise_5fmul_5fbackward_5fmeta_33',['jagged_dense_elementwise_mul_backward_meta',['../namespacefbgemm__gpu.html#abfbf6c239d283084ed1c68f18ea24af5',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fforward_34',['jagged_dense_elementwise_mul_forward',['../namespacefbgemm__gpu.html#aaa297ab58f55125d7eb7b040cc4c254b',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5felementwise_5fmul_5fforward_2ecu_35',['jagged_dense_elementwise_mul_forward.cu',['../jagged__dense__elementwise__mul__forward_8cu.html',1,'']]], + ['jagged_5fdense_5felementwise_5fmul_5fforward_5fmeta_36',['jagged_dense_elementwise_mul_forward_meta',['../namespacefbgemm__gpu.html#ac30cb8e7e035c24bf4f6ac15bf1b623a',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fmeta_37',['jagged_dense_elementwise_mul_meta',['../namespacefbgemm__gpu.html#aaeeacda7f3587bfe9bf2ecf376dd635e',1,'fbgemm_gpu']]], + ['jagged_5fhash_5fsize_5fcumsum_5fcuda_38',['jagged_hash_size_cumsum_cuda',['../namespacefbgemm__gpu.html#aabd8b530d0ac7e5cb96cf19c7eb517e9',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_2ecu_39',['jagged_index_add_2d_forward.cu',['../jagged__index__add__2d__forward_8cu.html',1,'']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fcpu_40',['jagged_index_add_2d_forward_cpu',['../namespacefbgemm__gpu.html#af80524a7d454f6db1c478808e8a659a6',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fcuda_41',['jagged_index_add_2d_forward_cuda',['../namespacefbgemm__gpu.html#a53a6da74de342260dcb15c68e9bddfd6',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fv2_5fimpl_42',['jagged_index_add_2d_forward_v2_impl',['../namespacefbgemm__gpu.html#a8e1ed94256304ab16b948117d5315ee2',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fkernel_43',['jagged_index_add_2d_kernel',['../namespacefbgemm__gpu.html#ab571c6d5519c86bddfe58835c8209a4c',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_44',['jagged_index_select_2d',['../namespacefbgemm__gpu.html#aca95193cb0cc3db7030f18cb59c6cc33',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fforward_2ecu_45',['jagged_index_select_2d_forward.cu',['../jagged__index__select__2d__forward_8cu.html',1,'']]], + ['jagged_5findex_5fselect_5f2d_5fforward_5fcpu_46',['jagged_index_select_2d_forward_cpu',['../namespacefbgemm__gpu.html#a71a54a14d90862afc8e5fe03e0c9ed8f',1,'fbgemm_gpu']]], + 
['jagged_5findex_5fselect_5f2d_5fforward_5fcuda_47',['jagged_index_select_2d_forward_cuda',['../namespacefbgemm__gpu.html#acb5a744fbd29c8a3a25621c2850686c1',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fforward_5fv2_5fimpl_48',['jagged_index_select_2d_forward_v2_impl',['../namespacefbgemm__gpu.html#acd9af0fd221ab3fc330ca9f278433a3f',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fkernel_49',['jagged_index_select_2d_kernel',['../namespacefbgemm__gpu.html#ab1228b502a424869c5a7353f9fe52316',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_50',['jagged_jagged_bmm',['../namespacefbgemm__gpu.html#ae94c97196a7c392695b64f0db906ff4c',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_51',['jagged_jagged_bmm_forward',['../namespacefbgemm__gpu.html#a5b01fcfb83764115f38eeab21c28a6a3',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_2ecu_52',['jagged_jagged_bmm_forward.cu',['../jagged__jagged__bmm__forward_8cu.html',1,'']]], + ['jagged_5fjagged_5fbmm_5fforward_5fcuda_53',['jagged_jagged_bmm_forward_cuda',['../namespacefbgemm__gpu.html#a0793a1a7b328d1351b6036d0be6a9c3d',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_5fmeta_54',['jagged_jagged_bmm_forward_meta',['../namespacefbgemm__gpu.html#a2722fce931f20d923aba071236be4c87',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fkernel_55',['jagged_jagged_bmm_kernel',['../namespacefbgemm__gpu.html#a33c7044a13254607610928c6825738b1',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5felementwise_5fdense_5foutput_5f_56',['jagged_jagged_elementwise_dense_output_',['../namespacefbgemm__gpu.html#a8fa5d329cfcc18c3304ba018919004ff',1,'fbgemm_gpu']]], + ['jagged_5fslice_57',['jagged_slice',['../namespacefbgemm__gpu.html#ab17aab73b431292434fd0d642a538960',1,'fbgemm_gpu']]], + ['jagged_5fslice_5fforward_5fcpu_58',['jagged_slice_forward_cpu',['../namespacefbgemm__gpu.html#a4e6521d00a6f81ad8ad7f7d38eef1aea',1,'fbgemm_gpu']]], + 
['jagged_5fslice_5fforward_5fcpu_5fkernel_59',['jagged_slice_forward_cpu_kernel',['../namespacefbgemm__gpu.html#a284b652fdac146671fc324ac57d2ad5d',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_60',['jagged_softmax',['../namespacefbgemm__gpu.html#a069ed261b53e7051b85f3e572cad7f7e',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_61',['jagged_softmax_backward',['../namespacefbgemm__gpu.html#a7ba518434a034920e1092bf6d73879fd',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_2ecu_62',['jagged_softmax_backward.cu',['../jagged__softmax__backward_8cu.html',1,'']]], + ['jagged_5fsoftmax_5fbackward_5fcuda_63',['jagged_softmax_backward_cuda',['../namespacefbgemm__gpu.html#a305d9969e73060e49580aab1456ceb35',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_5fkernel_64',['jagged_softmax_backward_kernel',['../namespacefbgemm__gpu.html#a7101ddaed8357d824a9eeeaff67e5c4c',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_5fmeta_65',['jagged_softmax_backward_meta',['../namespacefbgemm__gpu.html#aad25e4e44afa7169c17e48d726ee0477',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_66',['jagged_softmax_forward',['../namespacefbgemm__gpu.html#a023a8d9db48d27efcd2e77ede6366f5d',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_2ecu_67',['jagged_softmax_forward.cu',['../jagged__softmax__forward_8cu.html',1,'']]], + ['jagged_5fsoftmax_5fforward_5fcuda_68',['jagged_softmax_forward_cuda',['../namespacefbgemm__gpu.html#ab117510dd56fd42f3d774d22633b107f',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_5fmeta_69',['jagged_softmax_forward_meta',['../namespacefbgemm__gpu.html#ac14e78d89697f34bcaa7c0a725c8a04a',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fkernel_70',['jagged_softmax_kernel',['../namespacefbgemm__gpu.html#a20e3d96daba045e321717b025f4124cc',1,'fbgemm_gpu']]], + ['jagged_5ftensor_5fdispatch_5fdims_71',['JAGGED_TENSOR_DISPATCH_DIMS',['../sparse__ops__utils_8h.html#a8f3cc6f3a1a83750715b4ddcb228ca8b',1,'sparse_ops_utils.h']]], + 
['jagged_5ftensor_5fops_2ecu_72',['jagged_tensor_ops.cu',['../jagged__tensor__ops_8cu.html',1,'']]], + ['jagged_5ftensor_5fops_5fautograd_2ecpp_73',['jagged_tensor_ops_autograd.cpp',['../jagged__tensor__ops__autograd_8cpp.html',1,'']]], + ['jagged_5ftensor_5fops_5fcpu_2ecpp_74',['jagged_tensor_ops_cpu.cpp',['../jagged__tensor__ops__cpu_8cpp.html',1,'']]], + ['jagged_5ftensor_5fops_5fmeta_2ecpp_75',['jagged_tensor_ops_meta.cpp',['../jagged__tensor__ops__meta_8cpp.html',1,'']]], + ['jagged_5fto_5fpadded_5fdense_76',['jagged_to_padded_dense',['../group__jagged-tensor-ops-cpu.html#ga6d19e2c055144e4fe59b06999be34670',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fbackward_77',['jagged_to_padded_dense_backward',['../namespacefbgemm__gpu.html#a861454c4383e6a0869a6c007fc498eed',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fbackward_2ecu_78',['jagged_to_padded_dense_backward.cu',['../jagged__to__padded__dense__backward_8cu.html',1,'']]], + ['jagged_5fto_5fpadded_5fdense_5fbackward_5fmeta_79',['jagged_to_padded_dense_backward_meta',['../namespacefbgemm__gpu.html#a8663dcc9727a468507eb75a849ae5820',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_80',['jagged_to_padded_dense_forward',['../group__jagged-tensor-ops-cuda.html#gaffad7e38f6faf5f8365784fbf82a26f5',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_2ecu_81',['jagged_to_padded_dense_forward.cu',['../jagged__to__padded__dense__forward_8cu.html',1,'']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_5fmeta_82',['jagged_to_padded_dense_forward_meta',['../namespacefbgemm__gpu.html#a4fc6df6df430f9f9a20d7fe9d88dd009',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fmeta_83',['jagged_to_padded_dense_meta',['../namespacefbgemm__gpu.html#ae45c299345273bf31be20e4893f58c28',1,'fbgemm_gpu']]], + ['jagged_5funique_5findices_2ecu_84',['jagged_unique_indices.cu',['../jagged__unique__indices_8cu.html',1,'']]], + 
['jagged_5funique_5findices_5fcuda_85',['jagged_unique_indices_cuda',['../namespacefbgemm__gpu.html#a006273b56cd5a2efd001ad71d801a551',1,'fbgemm_gpu']]] ]; diff --git a/search/all_b.js b/search/all_b.js index 9ca306e65..edd3377ec 100644 --- a/search/all_b.js +++ b/search/all_b.js @@ -1,5 +1,13 @@ var searchData= [ - ['new_5fmanaged_5ftensor_0',['new_managed_tensor',['../group__cumem-utils.html#gab708b23762a11187eb6a32a36f0e34a3',1,'fbgemm_gpu']]], - ['new_5fvanilla_5fmanaged_5ftensor_1',['new_vanilla_managed_tensor',['../group__cumem-utils.html#gad5e0d2307667c3db5e73f0c0eec15df5',1,'fbgemm_gpu']]] + ['kbackwardmaxthreads_0',['kBackwardMaxThreads',['../embedding__backward__template__helpers_8cuh.html#a1844f7d12c928eeeab43f95ae91376c7',1,'embedding_backward_template_helpers.cuh']]], + ['kcachelocationmissing_1',['kCacheLocationMissing',['../embedding__forward__template__helpers_8cuh.html#a377d2c34d1f3becb19a91ea600e05321',1,'kCacheLocationMissing: embedding_forward_template_helpers.cuh'],['../embedding__backward__template__helpers_8cuh.html#a377d2c34d1f3becb19a91ea600e05321',1,'kCacheLocationMissing: embedding_backward_template_helpers.cuh'],['../namespacefbgemm__gpu.html#a377d2c34d1f3becb19a91ea600e05321',1,'fbgemm_gpu::kCacheLocationMissing']]], + ['keyed_5fjagged_5findex_5fadd_5fdim1_5fkernel_2',['keyed_jagged_index_add_dim1_kernel',['../namespacefbgemm__gpu.html#a7d13c6946f45ae31d20aaecbd2316fec',1,'fbgemm_gpu']]], + ['keyed_5fjagged_5findex_5fselect_5fdim1_2ecu_3',['keyed_jagged_index_select_dim1.cu',['../keyed__jagged__index__select__dim1_8cu.html',1,'']]], + ['keyed_5fjagged_5findex_5fselect_5fdim1_5fkernel_4',['keyed_jagged_index_select_dim1_kernel',['../namespacefbgemm__gpu.html#a0a518ef8f85868c32ac832576f8504d9',1,'fbgemm_gpu']]], + ['keyed_5fjagged_5findex_5fselect_5fdim_5f1_5fgpu_5',['keyed_jagged_index_select_dim_1_gpu',['../namespacefbgemm__gpu.html#a50a64d97045199097d3ff83edaf56a1a',1,'fbgemm_gpu']]], + 
['kforwardmaxthreads_6',['kForwardMaxThreads',['../embedding__forward__template__helpers_8cuh.html#ac9909b6865afc4a3e07fabe1ed204459',1,'embedding_forward_template_helpers.cuh']]], + ['krowinitbuffersize_7',['kRowInitBufferSize',['../namespacessd.html#a03257f8b2bc7207cc362638228aeb2f6',1,'ssd']]], + ['kstackarraymaxdims_8',['kStackArrayMaxDims',['../sparse__ops__utils_8h.html#ab6183b92f9eac6ca49e3055d79dfc83d',1,'sparse_ops_utils.h']]], + ['kwarpsize_9',['kWarpSize',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a4a63994c436795f993c09c5626acfb05',1,'kWarpSize: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5cb5e51b17eeacd9818bc06b9eb55ddd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5bf3f753d62805ba481f4394edfa3158',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): 
gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6e814e4e84507c4c3d932abf55dc8b86',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a1df9e821214c938534c26d9ad87c1cff',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6c1937cacb2c930220dfb75c2ad2fdb4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a375f1380c0a43779a6521f855f7c90ef',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#ad74db204c21ce57463de29efd2b51c22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5f6257aba106ad398e4b4a75471a8642',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): 
gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#ad8b31de2b716f254b2d55b709a332afa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a4c8628eff4245612b72787529fa2588f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#ae0f0975698d817274d5b21d1dd31285c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a2166d1c956baff37ca5f2aa75dd5d29e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a3bf7d511b93dad425030c52ff0b35378',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a33f0706d826f38b6f36f4657f5a4bbbd',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ae5ffff834bcf0d76a398a76c06a9d01b',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ab824e6081e4272e9f56dd57114a11d1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a0157d8084d739723c62bc11e05187901',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a5732b42f4e3be21733885ce73871b37a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output): 
gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a390d0e97c72c325e3497aeaa3226d527',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const 
int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a64b75e41b7d50f479b37a8c9cf0c1bcd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a446498d5289ca85dd627faffea758f45',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#ad471b8ae6cce12a41ac160db1243f289',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#ae31ad4c12fc469e5ea516f04f158b98b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a445aa60d61ffd3755914ffcf55c1a6fa',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a91f7f08a7ae090f72ea7236ba0fb5c96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a447c3f2918447f50e234bb7c3e2b1532',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a03451f7ef0e82d0861c795948f00bf9b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#ad19427e173ef6c061d7a98427d69a595',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a067846db797129cc6a85a87a6009c288',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#acab5c90a244916d389e9273df81384ab',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a8c2c7cc342e76ed32a9621fd6bc6753e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a6da3d4d33386cf358b201f5a9a2602bb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a59dab5f4ff3072665da93792aa3f85e9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#ae39679f36fe6a0b7b8846c79f69f4bf9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a74aa12547ff3a9b9787bcdffe7b95e71',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5e2c26bd8a7744de11021a9356b59a74',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a98c8243c5834d18ba31ffd8f3a570480',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a9ace33692ea18b9bd6c92308133c4499',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ad9f02bfae155a2b4114e80ed9ef6390c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a225fbb17a5d73ae68945ddba0baf3960',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac93d19a97b3d9f1b1ae742787b03d5ba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a78f61ea01f92fc50b78d776edda5691c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2f1fea77b7579d1cab96be89c027396a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a5ebb0d91afb08ea0721308c278b18b89',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2f21c580a600ad4f25aa58bbcae83e5b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a042eb088419228e49b76f7923732ed0c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#acac50d0765417aed0ba2275ef09e7363',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ab528e6c3e784b1648ebe89230f6f864c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad07738475ef95243e6a5d08e8e6096f1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ae4bb5bd4f1fe9caf6f7a1d3107a479ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a28bf244596f3c3376a70af53e767ed7d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a0386dbc79ea0aaacffbe7cf8cba78167',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a88a5089ba98be8ad981c0d2fd5c74657',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#ac768cc0753ab5318bbe47835d4f9fa9d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a557bbce544c0a0b3dd4036ec01b6df55',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a802903738d859e74795111ec77fb0268',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a8d14751fd1f29be0069e1a35e0f921e4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a1773883a254e0ab07fb0313e41e997d8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a735bf953224cfed630501bf38342b07c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a608b71f09301faa6ce5838495c9e8de1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a5860a2f37abc179f0358170ee6403905',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a21576335b9047871158e90e2032e8912',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a04bf660a884cfa9ce91901a66fd99f75',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#abf0dc6720193f4ab9a278a95c495572d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a2f8395d5782bc2895b99dde1a0a5ca20',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a2bfd2c4264e14c4f64b737892c1f4f06',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a07612c8115947993a4f0659814bdb991',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t 
iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a9ceb5776015ed4c35b0dabca7fa8f4c5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#ae682ad60acd60875e5499ef3ba62ba8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t 
D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const 
int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#af09531afc63fe34068a117835f5276e2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t 
info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a3fd0b06c245d1deda1dfd409ef777dbc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, 
float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a0b140fe99d998657ba70d37cb96981ae',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a98043b075d1f73a69bd0b19b1a24283e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a188fb685cd69453ab94f992332f523a9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a025f08f037ddf498278c429e09fd4d4a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a35f6a98383bf1ed951023b1fe432ed4c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a336a999e1b383c51b25841fa00f768d4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a745a7f66bb6899e5071ee55e90f23368',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#af9137cfc1d9e0421323b78bf589c34fc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a440eee4271eb5f61b204de4ec66054d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#adf1cf7a1807aab50d346ef163c534c1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acde8c89a937e31cb98aa026b261cfe23',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ab8e910b2c682642ac61185d1b155c5eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, 
int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ad1d9234d02b6be2ab2bdc5f4a8dc5701',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acce157d175e9e72545e8784647a38511',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a377694b1c0ce71b8d0c56077a904f7d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8d2430849bd51fc5ad283d1a300cabba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8ecc1609ac62272a2c0f5a1e1cddbed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ae8c1bfed5b951970a40f4028998d21fd',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a73eba662cabf7a9761d2cf5d195206f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ab903a35e3bd981f1436d46179b87ecb9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a4074249c4919e43d534eb0904fa4693b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ac2f871ccb0e37e363f7b979d923f944d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t 
max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ad39229402610f8c9069ea8a7e1c6a0ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a01c0225eea92b7b0403572335b1abc61',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a3490c2bf081c92095011640fb03961b5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, 
float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af95c4def12e4117e2d7bdc89b8fb0506',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a3f1c2f2aebc7a13ddade48d2a2f0301c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a023ead14754421961a4b473a3b1bb81c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a5ff2b2e15a95a8d176f99a8eebddf45d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a97cc1b7bcf350b322be5238011334085',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aebd1c348edc2accec933a20abbf4ff2a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af13fd6356fec61b096f429f666c4d50a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a8ae09f234561f1e415ef920bbf6eba22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6c1e5c2776f4209766c769243bf57894',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a96ca79bd9787eab9dfe57a09f61590db',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af8966c1a682b91a466caa300f057d2cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#aea22f0f456a89d61d1a066e7b363f59a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a78184f7b8b96c9fc9daa6d61c6bf8b32',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a933ef9f4d58e4ecad71988cd6f5ad537',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af15cb1c5b6cddd5d3678e3cee0a6cefe',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a11b62696a1fcc6753a62e4b7b78987a9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a58cc18641eaeee8eb587cb2a3726e85b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ad7d432c589db7e87949a9d0ca5533b54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a98fc1738f166a55809b2648796416db0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0cb98e4afaf555388869ebe3242fc7d0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a2bcc4982507c7169f085b06d8bda77eb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac58c7e73b10a41dc9f49d4e477b20fb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const 
bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac7cbe79ba3521a4bbd4c14a74fd6adff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0e895892d276833086475c0e7f1b7927',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a1078e271f687367cefee7d0e75efe3d6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#acdc78be52effcf8cc2c910b822c3ee7a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#af8601ce12308ef84b4899504296ae6ce',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a8b3df46fe1527fa468b07f9b7629420e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > 
temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a7fda08a8c83a3557857418ea43e4dea3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): 
gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#aa0685df0fb0a672d3d2237bd536db1b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#ac806eae9eee01106ea55ef146007dfe8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a06567b685179fac57c60d07bfc5596f9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a8c8e7afc35b5a54e69b3826c35adf2de',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a621cee00cffc059f6e5dac1dae6c870a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a9cf51444fe766e08d86ec3b884680083',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a1f4c065ae0c477c9055f201ff1d77eeb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a2431e3a9f193cb26104acb7111bb16e7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#ac73727b32e66331f5cfe9705c2bcf9da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#ab17cf37109f61a98a1e67e278282d410',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 
2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a56eec79755c7e031dac93d7fee216fcb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a9e80797bba1bde61de4e23580a123045',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a2309228f9f01e4fcfd7620b415458f5a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a1edce0d6c349a03501ea2777a101af79',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, 
true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a095215da51de608e36ba8292e72c72af',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#ab671eaaed996f9f41eca1f557abae645',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a508ae4c79692f2664971272e30d3fc2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): 
gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a1ed3a6b528acb610a62f188de95ebc0d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a4fb277896c516d3421f917fbbbbc10e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ac34e39eb121e260238cc9a54f2d13a85',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): 
gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a761283af416d74a3f610cb64f134cbad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ab2bf567d2b9120f65832f9e8e227c3bb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#af798849724e5b343ef0987b64245c41a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#af04ea470cdd8a07f331e1efbc90114d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ae6c1506493e5d8b4539080b206713dce',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a0beafd356bf1cfb6ea68ff7e1bd2992b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, 
const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a356f3f696dd24ffa3fcf741fd8cd2ce7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a36b2d055ae9089bfecce1598d5ee5734',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a5765c206de6271ac6019a718fd7ad6b3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ae0f60eb17c2973c16120ac880fa1405c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a33f0017811260350774433a6b81f85ea',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a929b3395fb702cbf1354da769ca55637',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a19a7ecd9eeedc4239cf1b987d3f4d15f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ab3795d6b83ee437c61880577c78b2273',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#afb2bcda34aa0401c61ef4fd5ebe0b090',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits 
> lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a025db262738d28e0f6d0073da9eecc1c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#ad312b70230d4098d8ac2747559c7f26d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits 
> momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a0b27ae9a200a1ece5394819d34ccab40',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ac9b1b580c02b691e732330917b4346b9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, 
true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a234aa0426b89c62486c8f88fdd7722e8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afa3433936e5b727b1211effc7414d937',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, 
int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#aea8e820a7a4bc3fe64bb6c818542a3aa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ad64afc5ea3a238f14048b1d678f617be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ac8f9bf44e289b97fd4b68b3ac86e5fa2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a3cf034bfeabf17e2c02ef5eff0e39d27',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ae9bd7c9d24668aa08267f29bcc8d579b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const 
bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ac2f4c84a8e13733979d8c8eb160d8ab6',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a9668523612cb73bdea52956fff1a645d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a36dbb2ed81d41998cd4ddd239f6e18ff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a47b1f486724dfb5ef0c59660725ba49e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af72ccb394ff0a9c8bad2415b26124ee8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af37b23a1376bb72a6936967e93403d29',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a839068bef278b0cf5184340361f2db61',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af3b0b53e46e934265545fef179bc4a42',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a3be6a3a67f391545b95c03cdfeabab49',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a9e740e6e3df02da3c05d0dfd940a2793',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#acff100b57110e4d629c786c3535bf208',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a2992f3c3797e58777a7f7d6aff063137',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a88eb41bb17cb58eaf37c6e5cc0ae1bfe',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > 
temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abab241cb0bbbdda5a3d240feff95de96',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a837a847bd0e24c4c323f60f3cd49bb93',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#ad25cc23c713ce4d2ce9a057d23d66b8c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aa8a6cd9058bac3b6775b6057a8b0beb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a28402ef2cf3a1b34fdadac6a6ef06adf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a33595ad9426cb537c6e4e9c2bb0d1cfc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a28fd1bfeb870e4192c831675880469bc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ab570073ba2f2dc988643433eb9ee56e3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a2543bb59812617ba91ec36256ea579b5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ae0bf41c3a1fa62e4aacaf4dd6e3ba1ac',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a60837a52bb429e86372390ab093b5c3f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a6a6e51dcbe4f354b395c5ef3a8632e9b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#afcd6a68c14971422bbcac044bc2e5fe7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a0e9aa9538f85f1a20881b99a619ff138',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a021fc973b5fc3d624856c3095ea0d8c5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a1a126c0d3c9315985228744121d10f65',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#aba4fc1bf5159b001bdbeaed09bac28cf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a87b68faad6789ef38e5ee96bdf0adadb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#abb70eda92401330f9c430e33657f5390',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#accc2086d06273c59409c74b598e17066',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a331c0180e8dc65d864006a18ae10f3e7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a1da0aa99bcd3a3a2ad540eaba284bd08',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a46d814dea7a25a249b9e0fc0c82023ed',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a19120fdc3ff0a026755d36ddb40ff43b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a1d111c3d803b0ed234aec8f5604bec87',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a7f28593d442951ae04e27670c892fdc9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a80acf7650ec2712ea7d51f7d5156fc39',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, 
float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a7643d87610f1fd256807566fcae51c36',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true 
>, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#ab9a28b117d8d2c802b31c3850cebf7ef',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a53612aafa2641dc1c70fc11355c354c5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true 
>, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a459aacd66b48c479d5773c84d129086d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aab8efedfe2eecb8e722290e8670b57be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, 
const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aef14d493a157796b5d5b3708471dd5f9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a1495ee920385d2c17517f402e4f2f1d3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const 
int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a13fd2498aed38e9bc488bba7aed3c70c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#ae1896638d5d062dd4fdb76ea25fa25ad',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aff46c2a59e01f53a86a7b0d79a618a13',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a0bf2b9756ea833bf245d6fd93a68bba2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a07fe51377b6ac8933fda5657824dfa00',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a0424cc55d1baf826ec4665dc699c0ee8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#abe7a518fe77140a9f84658b9be73ca57',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a93379bd0b52108c09ce0c6012c1a0bc4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a9efa56f919a034ad1c2eb4339babfacd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#af1b7ece649e9d0dbeb4a372364cfbf54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a5be30952d02614260f81e9b29d17f767',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a6c887e7cd209eff2be60616a0eb2ea9f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#aa6dac18027510aba99d797d8c340fa0c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a1aa0bffadd8de61d9327613f1b0c3d8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a805ef69c6e5b5bbd4a5d70b053dc8940',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a67e2d754aeb8030c70dfdf94358cac76',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ab111a2635d39331e5dde581b2cf5ab40',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ac42deea1306a7165392cc02c0c962381',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a216facff7aab2092d3300f52f73f441c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a9a65221171b1118ec811d883a600b7eb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a2dcf33b730969fab9d8d9e13f5812500',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1628e1fb812ec5d70a2a3701145ae3dd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a3af1e6fa25253eb084459b3d13ebf58a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, 
true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a5731f2347a6fc9cfce399131b7535c72',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1916dad21c1174ed094bf7cb9990674f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#abcac665cc8837bd07d64ee1f1d22c9bd',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1627d1331758cf0987f80b531597de96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const 
int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#aa2f1fe9cdd926d486017e9c9e3ee401e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a58dd95b539386ce0756417ffa7e3c675',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a4ac5aa9e7a97b988f21d79f2c77e8a42',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a6ad697b6cced262fbf9c5329af882295',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a946e1b9e34decc6ef732c17c06eaf67b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a99d2945d0e14c762a262971ad5cdddca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#ad3382f93d63430516e0fa4ee3dfcd35b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a88e9b91386946c328e4ea9cd1074af16',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a376fefbb04f4e4d081447881d6aa3ca7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a11b57ed4691d1c773211ef5481a6dd02',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a0e70fccd2a1e2a9e2135f0b38f7fb8b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits 
> grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2b831b47546fedc2c25d2ade8b88b756',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a189144e6ce32a982c752160cfb103ec8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#af70ed3aa3b3e9f4ef10054777ea73ab1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t 
info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#abb5bc6565be4b9b6cc47cb4ca0d02a12',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float 
weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a1752a413ef2e5ee8694cbed313bd3c9b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#aafa7d80ed4b830a47066853afca5adb8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ab0ee6537f36eac8a7a5af1623b9034a1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, 
int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12afc30313df2164ef2c299b47d3762f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< 
float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ab41d9b72247799b42c181dc59e842a2f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a2aa1026f9d94c927bfdc7d12f23f8626',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float 
weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float 
learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a015143a1cf9641909ef5739492836ab9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#aac73098f12c44ace7bd0c6ed29d1acb5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#adde42935e2ebd0c4cbfb5a925c603d3c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#ac9113b72a8883bfe52a840eaf6bf0bcc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a1a4b7c2d1fb4fe724a9ddcefe4a3ad96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a4d4d95ee827c360821c77e0f6a5b533c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#ad6b46d8dad6badf1a2e13000e0809359',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a69800c08002e6a964629da3691cfa699',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ac0a2c283925ea0172a022b44ec4420ab',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a6ec6ef39c438b48fc5ff99850376c2e2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a68d39c92f33a5fb23bf494df10381aa7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#aa9a73b585d5c585477687c3b42859fbf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a502058fc25fa19bb0cd2e7cfa440c82f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a157d6cc11ad0a2f4127709df3181d056',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#af6c8d616d0e8c2d6738c38fece880943',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#aa47b1b3531724ee008b8a88a913375d4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a0b393ddcfa07501c936c09103420a327',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac99cf2df0002f1359da1a71821a5d7a6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a16c1dd81db1a38927c5a39968b2a2047',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#adea97673f55b5d43fb1091e7cb082cae',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac80ea89a8a915ac8a1a6eaee9bd3a921',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a4b2cdd16081fdd55ef997fcba11943b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a929c9944210d7078c0bcc89ae2ad2239',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a942425624762de23778b8ea3b8da1267',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a006b3b6fd358ff41f9dad5c39f2cb330',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a81cb91d9b5c6ba53ca66e62ad21265d6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ac028b85ab4d730883ee7b170a11039da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a73f35746d0a9bed1751b964c07d2c3b6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ae5efcbb0aa7b60c29535ac9c49bbb00a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a3d28eecf8be5cfcbcd71fef6322ef6df',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a0e96ba84ab91aea304a2e6ac78eb1fa2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a96833312f1cb3bd4067a854dc1a85d9c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a1ce149ce2e815f85f42f779e853b6384',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a5e2d57ad35649098aac904f8acec4d7b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#ac8d5805872473e761a71634add6ae7b8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a873c28f5a06ab6135240b18b23aa17d5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, 
int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a86ab9a70fb4459793418ac95f6844494',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#aa1d6f03c27aea0bad56e3d38003ffda0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a0e3cae02bd4631c5b65507b91c500606',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< 
float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a7929e1f87db6d3d72cae3804c1aafef0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a0d5f1eb18d7aaf74fabc0d63a215062e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#ae319b79d484f9cfb10ddf935cf3dce8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a50b88aba0d96371aba370d9894857aff',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a24d7b65f902789f50e1a0fdc3c72da0e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#aba27e610941b3c6a9520a14a567022dd',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a2922d0a81b0f1a4427fde265b05427bb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a261934c69234b20a2f19650fa88e4cd0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a60c7d08b38c83f34ba87438440f950e8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ae3f8662de26a86a0e1e1612804f49b52',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a3530c5b60b4dc3bc1fd5f0af31e32361',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ab4098b3e8ab8552ec947cbb52de77a0c',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#abc6e9b570bfaac7771adbc13408463be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ab2bfbf20e506af2d9ee18af83b527e5e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#aaccb85f565c3b1d5a36dbf413fe05ec4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< 
at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a799bc0e6eb13b05b038c910b7a650bd8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ac2e2b47b2c51943f4ff8fabdfb57f270',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a026a0932fcb72fcf66460486db323e9f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a99db511954f4e0ced515daf371cda8da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a6baf7387932e58b5a570e01ea0fb2638',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a1f859731d1effb901df0012fbdf35756',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a3451cc31def5c831a428e221f4713d57',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a55aad527eb01f16edb9ec021704e4a60',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a494688d7ae0362eb0e5aacbc0ecf19c7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a95227f34f2ab6c04dcaadd41e1886304',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a62e5730bd70e4665352946a17b3fd18a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a91027c49f28b9c30a8fe20c5ede43b4f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#af003bb8591ecd9b6b755807f601cbde6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#af9fea1c8c674df3acb9e76cafe6518fe',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a5f06095eeec3319c0936d2a99a095054',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a30c5340455dedcd1684d0858738d7c9d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#ab753932a15b63161c3d38c683e2d290d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const 
uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a874c752c07a36fb38f9476fe78a46735',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ab2c641791d87abe8e19dcee2b3726819',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a7e755382f2ce1290c7c3357ecc025b78',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a30b2c51012735e4ebe919dba89c4d8cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a365c2eb2cec39bb504cdae18934b89c3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a004e2b2b3ffa5c4c402b2f56fee16ce6',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ae3be8dde6dfb4fe3ba1a815b319a2925',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ac5817730d59e634a76e7aafce41aaf26',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, 
const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a93d410b588239e17ac8e10d7d6e291a0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a4172fb110abe23887cdaf0536ef2bcaa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a03929e871fb455cace7f23efc0d24583',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a468a45d6ca5a19247698337fc33f435d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a184065748160f0c7788467d39b27f5d1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float 
tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a6a9022b14995bf97b8f204dc404e1e8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#aea5128dbea65fac0ceb8b42749f74099',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a8f1a8a90b130ae668e3b6b7947c6c4f5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a2e889d0595ab0362613d58e7ff8960b7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a6014caa4aca0c9e7b583e71900a0a48c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float 
adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a90005cd7c4e9aae8498fd1d938983179',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a384fb2660e3cb8a46cf1154d5b45bf2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a297213250dad534fbb5b3654e854f1a4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a012a4e2ee1f52bb243e5388eec3e8a5c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a0c3bd53d12b516a80478d5a9017a684b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#ab20ec4fe16b91aae91640b2dd5452ed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a843389bf3c054d1a20a6115d47d99cf7',1,'kWarpSize(const pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#ac79c384938b7bffef4943090b602ba5e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#aeac8eff5cada3efbb3674213a5f42bc9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t 
regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#afbd549d3981439a47fb0c3811e9eacf4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a5f6a21f619bb88465b760c5556fe6f1b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#ab03dcc766f91725239b7737cee2b194d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a2b9bc69930f735395605b0b91203d7a5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t 
learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a6d5664cd6fa11c72a6de5f652e0aec5d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#af412ff33330b1349cbf7c2a33e58f9a7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#abcba604787cbdb187f05ab27324d67f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a33f05c8d5a2149e88f0c5a0a446357c2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#ad387d04e602a3a29f7b44eaeb1edb9fa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a9cd29bb0dd406092916c5eb0605aaf0d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t 
adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a7b5add21eacc916018bb3b4e0fd96436',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a5da06cf5b2fca41ca811bae68efd4049',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a853a5012db3ca2150440460e10d486ae',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#af1c9033199b40adc628848b21f60b950',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a0d5d4738a27dacbbecc699b0297a6331',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a260b636a6d13f307a286c4b24b47a1cc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t 
iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a05a2693fb5198654434e63ef4a07981e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t 
regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#ad178df90f04b6ef9c3c907c699042d8e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a04d2d84d9856aa9de1f36e1813d4c172',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a0c158805e4537d8825326a3ecddf9c9c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#ac6a60f786cbc800c9b675f386c1014ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a91f984a560c40dcae1abbb2391fa2fda',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a6c5b8de0acb5391f4dc4172ce5ca094e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#afb504ea4eac563c64b42343e986a7847',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 
1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a3c2fb3ecac9e0bd458fbd1023025d5d5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, 
const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float 
weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a6dfe39e1df2bced46b2e0991e3435be9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#aed82b2485ec72bfc56b2fae686d062f0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#ac3a5c0e1adaae87917f2645e6a2afa46',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float 
max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a54694cb47dc38390f1b301aa039cb31d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a9c2f7f4369735aa317a88c819b378f43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a98043b075d1f73a69bd0b19b1a24283e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a188fb685cd69453ab94f992332f523a9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a025f08f037ddf498278c429e09fd4d4a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, 
true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a35f6a98383bf1ed951023b1fe432ed4c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#ae079dac6052edf65f8a39b4fd9de7c70',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a745a7f66bb6899e5071ee55e90f23368',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#af9137cfc1d9e0421323b78bf589c34fc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a440eee4271eb5f61b204de4ec66054d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#adf1cf7a1807aab50d346ef163c534c1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acde8c89a937e31cb98aa026b261cfe23',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ab8e910b2c682642ac61185d1b155c5eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ad1d9234d02b6be2ab2bdc5f4a8dc5701',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ae5465342deb9e71765693c8929b5f475',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a377694b1c0ce71b8d0c56077a904f7d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8d2430849bd51fc5ad283d1a300cabba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8ecc1609ac62272a2c0f5a1e1cddbed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ae8c1bfed5b951970a40f4028998d21fd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a73eba662cabf7a9761d2cf5d195206f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ab903a35e3bd981f1436d46179b87ecb9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a4074249c4919e43d534eb0904fa4693b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a77fbe03e0ff353a2ebe490cf97f0c353',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ad39229402610f8c9069ea8a7e1c6a0ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a01c0225eea92b7b0403572335b1abc61',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a3490c2bf081c92095011640fb03961b5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af95c4def12e4117e2d7bdc89b8fb0506',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a3f1c2f2aebc7a13ddade48d2a2f0301c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a023ead14754421961a4b473a3b1bb81c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, 
float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a5ff2b2e15a95a8d176f99a8eebddf45d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a4f470748a75cfc59c5c7a0cb577289f2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aebd1c348edc2accec933a20abbf4ff2a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af13fd6356fec61b096f429f666c4d50a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a8ae09f234561f1e415ef920bbf6eba22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6c1e5c2776f4209766c769243bf57894',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a96ca79bd9787eab9dfe57a09f61590db',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af8966c1a682b91a466caa300f057d2cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#aea22f0f456a89d61d1a066e7b363f59a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a22292529eb85249ba3bec7be758eebee',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a933ef9f4d58e4ecad71988cd6f5ad537',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af15cb1c5b6cddd5d3678e3cee0a6cefe',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a11b62696a1fcc6753a62e4b7b78987a9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a58cc18641eaeee8eb587cb2a3726e85b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ad7d432c589db7e87949a9d0ca5533b54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a98fc1738f166a55809b2648796416db0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0cb98e4afaf555388869ebe3242fc7d0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac0e36eb9e678f52e0561366229ecc4bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac58c7e73b10a41dc9f49d4e477b20fb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac7cbe79ba3521a4bbd4c14a74fd6adff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0e895892d276833086475c0e7f1b7927',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#affa3d280e56d69dbe39ea3bda0bcba6e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a7f2d784a0f6604d457a71d725eca24ef',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#abd72df618308d6a739f91188cc5a1e91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a5db669968a840fd6cd68feb612d416de',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a94192c3fad25107220bf7cf718abdfed',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a0dadc1a7dd7578c22f5d239047bf7794',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a48f4d0c7f7758b5149c9d96abb61354d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a173df29f55015b4b4d8c9cdda6986823',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#aed68dca4d92a97e556d3073cab88a18f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 
2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a23c47f9e7c8f8a011e9a2d3778e2a65b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a856011203b19087ab6f1eebb7a8f18dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a69682ffbf2a367fa7e6d25edd9cf1218',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const 
int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a12192a01089a95a93f5a384e9faaa312',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a58f3e7232aae5283c177ee7305d1bede',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a7412bb61fd123be30b935508b1839d66',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#afd57c62802e581a57d2e9daa52b09e4b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5d3923934afd4c41777f94dd36798bb8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac0c8d3772833c3ef461a44cabc9cda70',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5b2d60d4092d3af5e898446d1ffc3282',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#aed55f18c1fecec6d6de78577918449a0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a240624068305d411db3cdece269f6a2f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a6854bab8c0d96882e4f9f980880531bc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a9ab47f5d78d1d005e9f8784e812589b0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#af7d6e1a2bc0d32d0273140358b977b4e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad94f32bbc65499df3140ee3a12f12dbc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a84246813bd816d0adfa4751b327dbfa6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a90ffe039f52ddd5cf5e1108e7116b612',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#af129bcfb9d742a9a531ee4c3324bdb9a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#add33ba4596f143bb11a12a3508c0fb32',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a7b80756a8fab65071212121bf535f2d3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#aef2fbd1a40bded32e9118172ea588823',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#aabf9c6be454bf78678fc82ba87ed2b56',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a2e2219247d875dadcb571833d8282ca4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, 
float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a6940aede0efad4a0cca521cfdcec433b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ac7273b842f26b655461dfe827e4bc669',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#aaea9d9291155e312439e673a39970cc3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a45a1b05c68acf892f30f7ee837bb5aac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a526a3a91d4d22f8f4b8b25d52ea7539c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ae9fc0a17625be30b2c3e94857e45e660',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a7d36afee5962e7c2e645ed580a9293d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#af6ea5271fc0e7434bb952837a4ec992c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#adf897a86ff3ef489f638c5d6cd604fb7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a6562c95418573901d3dd3e933fdb1798',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a2da0fed4926ad614ee0554b8f818854d',1,'kWarpSize(const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, 
int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a29dc67ef45e2c108c079066771ca4b15',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a908dadafc7b1c847ac07f402090b784c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a861cd39a27db6459d3d308938724a605',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#aceb4cd33e669bb98a7d191fb45221a80',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a9a42f11861e28ce77032f8047e83ea11',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a8bdc52848ae2ccea30492b4414adb034',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a0c819af30fed201203e68ceda2eca173',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a0720d53a4c9644a99b5cbe9e245dc3e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#af0d4736eed64c8bbf3a20923bb9c29f9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#ad54cfe3bdecfc6441753596772402ca3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a2503c0d4c5e56ba15bfb7df317dda0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, 
const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a6d8a94d5bd394aab6b93267e3f0f2673',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a5917f6544b279539f51ba07a7d4d5ca5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a160f8f69b25890024d8d91dd87bbba82',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a64537991cc98a52cb2bd884dbcc7bebc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 
1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#ae0574dfcf396c58dc8863401720dacb5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a6bb05de78f7804f75e027524d191e5da',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a64b45cd53c38d53cedf6f4d66afd11f5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a9d44bcd45f9e02788aecbf226dbeb110',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a1d850e642c167b5e60a73c88a47f7f16',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#adae5d509289eae4626e7cc6eda18efbb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aebc5dd156def696b75e9590fdd7e44f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a4e775aca46c2cf5dfe37c97a0c320eef',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abff5c0eb0f3d6a4dda6f6a5f51450dad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a7cfbc77648395dd0be255b6c2a04797e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aedc7a4a2ea94e6294c49780531ce8562',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a5d38c3b8f12784860c0d0219684a22f2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a5ba8347d410dea8ce7952d7e5674a053',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ad491955b35ee3fb84ecdbc35426aa9c6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ada25cfa4c47b6ce54c00b842e414e5cb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#af4e0c8de103d5b95b3930d72723dedde',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aa7f8ec16263de0ec18ba44144f3f6409',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ab49ef540e21a06c9366b7a4b1b643855',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aab891f89faefe34faf30508569d63250',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a9a34cc24631cc7850723f21d44ac9bb3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a0a8f9dbd0e03e001dc43109c9b58edca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a8f1dc526305df11d57d5151eb78864fd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#acb04bf74aa1979914c837887050094ee',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a79789b33f1e3e7e2f3908b939ae1e44c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a3b929350b08473bf7001fb6e8d38f64d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a027461b35f0b0e8c2245ef80575fe911',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a81afc9990a7d79a97ddf8ee0bb84f62b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a98e60157f32325eabb7ce026f700e32f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a6050e98a82b09a3401ba1bcefb21abdc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#acb1714d604a523f5860b4c87e669c715',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ab83eacf00da6299593ee678a4b1e4615',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a4682eb9fbf137eb4577349e11559ecd0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ab5dafd4069aae36629ecb34e3975ea6f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#aa223add1301373e53e5b0ac08530a54e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const 
bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a46cd5ec5d4f141fe5dcce4a8b22a1aff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a8aee7d2ca70c048a87381106420a93b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ac111217914f0bc07a2ec19cf00f46b52',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a1a5b4fc1cd662532df45be95fae00e34',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ae8e719bacb730ff6f6f24b072264fece',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#adc76a3911b3c75253490fa732520c59d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a0246985d6062109ceb9d0a316e236be9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ad84b0786ecfc63b8b6b3a1383dbfe719',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#aa5cf42df68862104a475751de18f2d7d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ad32698d0cc220a69f7ffe6cf58fe5389',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#af6476f9b0a8e869bb5f1fbc1c39714ca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ac2742fc3885cf36bb8ac4d7d4c24587f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a7b4db1681ed1be00464c3420ff441efa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a050fc99733adcb785414bd0c401d02e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ae79041a9602287ab549b549edc4f5040',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a981ec80c80a0ca3713a250bca8dcfd2d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 
1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a8e5539e49116fc0d95e74b70fff7eb96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#af896552004ed24a4f6289bd6321b95b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a153506803483f7484c6fc69a32b06b26',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#aae8702725dfe41086ad78bb86764b34c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#ad07aac3191ff79c34b89afd9b89305ad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#adf78b0255c91deececdee2d30eb7f2ae',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const 
bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a3860c9b9fc99bf6f1e19426e6d95f473',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, 
const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a57427b04d21bb9e1302a85d709f94e02',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#aaa3e935211a7fd38509a279705c5e5d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a7ee0552285c492916b1c76b31630d3c2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a4ef721cf4ccbf7faeaad926427c279f7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a0ab8a7e2535ae5a3f056f529bcb1071a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a573d877b87f31127000da9bc22ad74f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#afcba725b1740e61675c5148dd9523082',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a027faf7fa459ca567059607e155a1546',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#ac7a033e21d13e0bd2a2268a4086c9770',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#ab065602b705ef3209e6d4de9f8dc0bb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a4342e36e81769a5d3992a7c557cb4e0e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a88f0e0bc690728b1e246b8248e9ec6e5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#afb628f9293807019a85f62216802fb27',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a4cfe4909493e5c6c0b3272b407756da5',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a54d7f4614b27377a702368d9be00913a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a8a9dfc0b7289bfa8ee20c3a9c89a1382',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a5e36f01e2e5309c8de784ae9cf8b6995',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ae45afbdb3f525626eeb8ec0c6be41f24',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a067da40e6e91e38bb46e13bab2169087',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a9ddc1dda2eb92f1166514ddb7da1bbc4',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a031019a7e2638f18e08649bd6c279449',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a63685bd7126cdab9a0d8e4046c3e150c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a623f10d789c87a085d7c83199ac22f55',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#ab30ac9e21532c639d357440a7edfc7eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a3dfcd6c505c277727fdc5a5efd1f21d2',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ab0581905b4247bac67216a78dfb722c0',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aa21811af7885f72fe15a805872bd5a22',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a14fb66cd776fba62200b634101140f86',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aff669225134b913ac286c1517e039727',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a87b17b201934f903fd2f193ac6a71629',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#abdf19a2e8c33cb0148de770a95bd662c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a469aee03c0d8fde04842d8747ef880bb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a3048e1d82f672e144f218a9bc1f02bba',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ad6d957e4c772be151a4b6c0937b71e2c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a4199338fdc51c5f831d168e63d783674',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a83fe0c13753b93fbe0b623e8bc652721',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#aea91359dc803899d522a74120b6d587c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a2cae6fdef6f90d98293e7e6f2eda2138',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a254bf4fa577be3f3edb7bf1dc9339295',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#ad9e0f41f3ef8ca4cd788578980ccd083',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0277f9514d8b9668290fe078c5ad155b',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5da4584d7767e8c488e4e29780c3aadc',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5ea57f49a9d1efbc601b256ec5d13107',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a93ac400107836c0de2730e3a54959ed6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0e17c23a544e4b4ebaf07d215ece084f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a51028935eee6951c1298eb5d7092d650',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, 
int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#adfec29a5a30407f3b60408b80419baac',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0566524005bbfc2c27eac06fe4ebe955',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a568c659233485f309357ee134d1b748f',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ae5b51047bc4a0305b636290e7ddb278a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a3262dbb14f77bf739b020bdf79075384',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a0f68f3297710141bc57e677b3d0587ce',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a4876b4f94d323f090efef96432fc27a0',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a1c7d35447c029aba8ddce8e9532a8d82',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ad69828fa35cf5312392a5791a435ac3f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a0209c0fd938024beedd0716523eaa090',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a9461df0509fec5e584eaa309acb4e0ea',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a95cb4e2cdf49f5f5ba2f9a2acf3ff32d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, 
at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#af585b19bb7928041ac8b70d56c7d6f1a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a8c3130a42a235a75553eaf160ac657dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a8bd2c5adf9e33805340e4717cd9f0617',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a3d0b2d3bd9c920851a41c71817e28378',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a436f00b93c571aa3159b822122e4e781',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a17466673ca73e70a4887999d2955aaf0',1,'kWarpSize(const 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#aea632259492fcd4ba0011382bee2beea',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a4458fec1221cc0c7df2c1ef8bef422db',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#aaf16de5ee78d9de99a703cdbe61255c0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a8835d1cabbeed24c96e827473542eea4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a931774f9fe3e608ee4b30ec8e200049b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a2463ea08e2eade6932bdc3b08dbf3f4b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#abdcfee895dc0dbe60f3899820e3faef6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#afdce4c5ff535f039b96169a5441d66f6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#aefd9621d81effbb756e78929daae8517',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a91968527cd3a341bbc8777ae41190d41',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ac9f0e82189d5fd39e1aed1f89eb7336d',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a61ff627d873a281ecf852f217e944c4c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#afe7f0771d29a6a9ffd897e23dd341d7c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ad96d48c6eacdc0589531c48472f370dd',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a82d9c13b59a58a367c962ccdaa95bc01',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ab46e47b9451a78d43c7c23cf897e9445',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#af359f9c87918957f14c927e52e0d719c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#af47656d04bdce098caf47b331b74fe2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a43cd667ed17b8606af1dd1f5027311a4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a99087a69215e3ecfff828e64866fb490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ad6141ba5c93e5aea872230ecd4a0d878',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a7bb186f4330ddb51696533419c414b5a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#acf2c859f1eecda3ddf9ec37754afe3e4',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a7745e0ade9aa98a7050c6a76c59e88bc',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a51039fcb60604faf673a12fc9962de52',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a39c7a917fc74982eb89a2a6770d0be92',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a094950f659c8dd934ea88348ed79dd2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#af4df56b4d05360a4cc547377c34a79dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a69613a0e40ad1ddb76bcf494c6eba437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a039318e8b0ec66d135fcd3f9b16a4228',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ab89613a21534acb8fe6c89a570467067',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a3947e811d4918cac9bd3e70fcce80126',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu']]] ]; diff --git a/search/all_c.js b/search/all_c.js index 1e64f383c..c4d1a28d5 100644 --- a/search/all_c.js +++ b/search/all_c.js @@ -1,5 +1,75 @@ var searchData= [ - ['operators_0',['operators',['../group__input-combine.html',1,'Combine Input 
Operators'],['../group__permute-pooled-embs-cpu.html',1,'CPU Permutation Operators'],['../group__cumem-utils.html',1,'CUDA Memorty Operators'],['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__permute-pooled-embs-gpu.html',1,'CUDA Permutation Operators'],['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__jagged-tensor-ops-cpu.html',1,'Jagged Tensor Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__merge-pooled-emb.html',1,'Merge Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], - ['operators_20for_20cuda_1',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]] + ['l_0',['L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a67824ecf84f5816f07b74fa956bdbcd2',1,'L: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a67824ecf84f5816f07b74fa956bdbcd2',1,'L: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#a71a77dfc9561ca59031082dfd57dd5ca',1,'fbgemm_gpu::L']]], + ['launch_5fauc_5fkernel_1',['LAUNCH_AUC_KERNEL',['../metric__ops_8cu.html#af8d70229cb61aff5f2f2e8f1abb10440',1,'metric_ops.cu']]], + ['launch_5findex_5fselect_2',['LAUNCH_INDEX_SELECT',['../sparse__index__select_8cu.html#a501f87ecefcbe28091d9a1c48499d3f6',1,'sparse_index_select.cu']]], + ['launch_5fkernel_3',['LAUNCH_KERNEL',['../keyed__jagged__index__select__dim1_8cu.html#a2ffb148e7bce97b5375e01ac265cc967',1,'keyed_jagged_index_select_dim1.cu']]], + 
['layout_20transformation_20cpu_20operators_4',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], + ['layout_20transformation_20cuda_20operators_5',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]], + ['layout_5ftransform_5fops_2ecu_6',['layout_transform_ops.cu',['../layout__transform__ops_8cu.html',1,'']]], + ['layout_5ftransform_5fops_2ecuh_7',['layout_transform_ops.cuh',['../layout__transform__ops_8cuh.html',1,'']]], + ['layout_5ftransform_5fops_5fcpu_2ecpp_8',['layout_transform_ops_cpu.cpp',['../layout__transform__ops__cpu_8cpp.html',1,'']]], + ['layout_5ftransform_5fops_5fgpu_2ecpp_9',['layout_transform_ops_gpu.cpp',['../layout__transform__ops__gpu_8cpp.html',1,'']]], + ['ldg_10',['LDG',['../sparse__ops_2common_8cuh.html#a9e7ecd25c1168b19568b2ba40a731c39',1,'common.cuh']]], + ['learning_5frate_11',['learning_rate',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a205f082b0bb0cee9301dc4e5d0521b5c',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['left_12',['left',['../namespacefbgemm__gpu.html#ad8f5e19e19f12974c9713e920ec54331',1,'fbgemm_gpu']]], + ['length_5fto_5ffeature_5fidx_13',['length_to_feature_idx',['../namespacefbgemm__gpu.html#a10c64e822d3634da34b9bf1f0c38d757',1,'fbgemm_gpu']]], + ['lengths_14',['lengths',['../namespacefbgemm__gpu.html#acbebb5d71fe9389f7b919325112c1548',1,'fbgemm_gpu']]], + ['lengths_5faddrs_15',['lengths_addrs',['../namespacefbgemm__gpu.html#a2b15eac55dd0239102e264b41febb49f',1,'fbgemm_gpu']]], + ['lengths_5fend_16',['lengths_end',['../namespacefbgemm__gpu.html#a80de4cfcf0b435f1edbf9ba9cb999695',1,'fbgemm_gpu']]], + ['lengths_5fis_5flong_17',['lengths_is_long',['../namespacefbgemm__gpu.html#ad8b8d41e5b0a7f0f67d18d46f561eef8',1,'fbgemm_gpu']]], + ['lengths_5foffsets_18',['lengths_offsets',['../namespacefbgemm__gpu.html#ab245b3e7b831d8e003a353250359843d',1,'fbgemm_gpu']]], + 
['lengths_5frange_19',['lengths_range',['../namespacefbgemm__gpu.html#a9599d315f833a6d562ee1d25d4ee5923',1,'fbgemm_gpu']]], + ['lengths_5frange_5fcuda_20',['lengths_range_cuda',['../namespacefbgemm__gpu.html#ace0a963a484e5501c50533122cdecc3c',1,'fbgemm_gpu']]], + ['lengths_5frange_5fout_21',['lengths_range_out',['../namespacefbgemm__gpu.html#a19280a435704ff4093b148460c37bc84',1,'fbgemm_gpu']]], + ['lengths_5fstart_22',['lengths_start',['../namespacefbgemm__gpu.html#a332f5a97c570870675755b52b91919d6',1,'fbgemm_gpu']]], + ['lfu_5fcache_5ffind_2ecu_23',['lfu_cache_find.cu',['../lfu__cache__find_8cu.html',1,'']]], + ['lfu_5fcache_5ffind_5funcached_5fcuda_24',['lfu_cache_find_uncached_cuda',['../namespacefbgemm__gpu.html#a9e8721a4003045038e10d3a4c8258c96',1,'fbgemm_gpu']]], + ['lfu_5fcache_5fpopulate_2ecu_25',['lfu_cache_populate.cu',['../lfu__cache__populate_8cu.html',1,'']]], + ['lfu_5fcache_5fpopulate_5fbyte_2ecpp_26',['lfu_cache_populate_byte.cpp',['../lfu__cache__populate__byte_8cpp.html',1,'']]], + ['lfu_5fcache_5fpopulate_5fbyte_2ecu_27',['lfu_cache_populate_byte.cu',['../lfu__cache__populate__byte_8cu.html',1,'']]], + ['lfu_5fcache_5fpopulate_5fbyte_5fcpu_28',['lfu_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#a45bb3081a2688f09448ffda6bc5d5f2e',1,'fbgemm_gpu']]], + ['lfu_5fcache_5fpopulate_5fbyte_5fcuda_29',['lfu_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu'],['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(Tensor weights, Tensor 
cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu']]], + ['lfu_5fcache_5fpopulate_5fcuda_30',['lfu_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu'],['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu']]], + ['lfu_5fupdate_5fcounts_5fcuda_31',['lfu_update_counts_cuda',['../namespacefbgemm__gpu.html#aca510adc64caa635df004e9b419bbb1b',1,'fbgemm_gpu']]], + ['linear_5findex_32',['linear_index',['../namespacefbgemm__gpu.html#a177d197b75db75ee70711f48a28e1524',1,'fbgemm_gpu']]], + ['linearize_5fcache_5findices_2ecpp_33',['linearize_cache_indices.cpp',['../linearize__cache__indices_8cpp.html',1,'']]], + ['linearize_5fcache_5findices_2ecu_34',['linearize_cache_indices.cu',['../linearize__cache__indices_8cu.html',1,'']]], + ['linearize_5fcache_5findices_5fcpu_35',['linearize_cache_indices_cpu',['../namespacefbgemm__gpu.html#a6eaeebeb996c343db6d076fce7952133',1,'fbgemm_gpu']]], + 
['linearize_5fcache_5findices_5fcuda_36',['linearize_cache_indices_cuda',['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets): linearize_cache_indices.cu']]], + ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcpu_37',['linearize_cache_indices_from_row_idx_cpu',['../namespacefbgemm__gpu.html#a9c7ab59a89fd36f5c07b9c86bdc891c8',1,'fbgemm_gpu']]], + ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcuda_38',['linearize_cache_indices_from_row_idx_cuda',['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(Tensor cache_hash_size_cumsum, Tensor update_table_indices, Tensor update_row_indices): linearize_cache_indices.cu']]], + ['links_39',['Links',['../topology__utils_8h.html#a434a916b92f4caf48f14d480c6aa845a',1,'topology_utils.h']]], + ['list_5fid_40',['list_id',['../namespacefbgemm__gpu.html#a07403af74afe12cdace7e1ec4ff38e72',1,'fbgemm_gpu']]], + ['load_41',['load',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< float >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< float >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< float >::load(const at::Half 
*p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< float >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< float >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< at::Half >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< at::Half >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< at::Half >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< at::Half >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< at::Half >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const uint8_t 
*p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< double >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< double >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< double >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< double >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< double >::load(const double *p)'],['../structfbgemm__gpu_1_1_weight_row.html#a889b0ea41fd15897021ab06b2d62bf29',1,'fbgemm_gpu::WeightRow::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2b08d5d5c065fbbe307dfa9237f58dc7',1,'fbgemm_gpu::Vec4StepT< STEP, float >::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ad300c1cf97abb3337915a7b9616b371e',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a86807843e011cecc10c8f37761f5fc20',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::load()']]], + ['load_5fd_42',['load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a016decd4d08ff2700a397621aff0cd67',1,'load_D: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a016decd4d08ff2700a397621aff0cd67',1,'load_D: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['load_5fd_43',['load_d',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a6aa5afd375a88f7cb364118fde074739',1,'load_d: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6aa5afd375a88f7cb364118fde074739',1,'load_d: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['load_5fqparams_44',['load_qparams',['../structfbgemm__gpu_1_1_weight_row.html#a5f3a7bac9f71533d09bb41e67708ffc2',1,'fbgemm_gpu::WeightRow']]], + ['load_5fqparams_5ffrom_5frow_45',['load_qparams_from_row',['../namespacefbgemm__gpu.html#a003948b9ad61509936564075f2cead23',1,'fbgemm_gpu']]], + ['loaded_5fvals_46',['loaded_vals',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a57864e02cf856e5c64f95a762c18151f',1,'fbgemm_gpu::Vec4StepT< STEP, float >::loaded_vals'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a6de86c6a3f25c34f8b13752e8042ea2e',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::loaded_vals']]], + ['log2_5fcalc_47',['log2_calc',['../structlog2__calc.html',1,'']]], + ['log2_5fcalc_5f_48',['log2_calc_',['../structlog2__calc__.html',1,'']]], + ['log2_5fcalc_5f_3c_200_20_3e_49',['log2_calc_< 0 >',['../structlog2__calc___3_010_01_4.html',1,'']]], + ['logit_5fdata_50',['logit_data',['../namespacefbgemm__gpu.html#a666f6d4fb27d254047edf38944a98e81',1,'fbgemm_gpu']]], + ['lookup_5fbatched_5funary_5fembedding_5ffunction_51',['lookup_batched_unary_embedding_function',['../namespacefbgemm__gpu.html#a74ffde7bbe921424bef364880c5d57ea',1,'fbgemm_gpu']]], + ['lru_5fcache_5ffind_2ecu_52',['lru_cache_find.cu',['../lru__cache__find_8cu.html',1,'']]], + ['lru_5fcache_5ffind_5funcached_5fcuda_53',['lru_cache_find_uncached_cuda',['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter): 
lru_cache_find.cu'],['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(Tensor unique_indices, Tensor unique_indices_length, int64_t max_indices, Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, bool gather_cache_stats, Tensor uvm_cache_stats, bool lock_cache_line, Tensor lxu_cache_locking_counter): lru_cache_find.cu']]], + ['lru_5fcache_5fpopulate_2ecu_54',['lru_cache_populate.cu',['../lru__cache__populate_8cu.html',1,'']]], + ['lru_5fcache_5fpopulate_5fbyte_2ecpp_55',['lru_cache_populate_byte.cpp',['../lru__cache__populate__byte_8cpp.html',1,'']]], + ['lru_5fcache_5fpopulate_5fbyte_2ecu_56',['lru_cache_populate_byte.cu',['../lru__cache__populate__byte_8cu.html',1,'']]], + ['lru_5fcache_5fpopulate_5fbyte_5fcpu_57',['lru_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#a8d6ac45089730a607c2a46a265ac8b7b',1,'fbgemm_gpu']]], + ['lru_5fcache_5fpopulate_5fbyte_5fcuda_58',['lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga5958e4cecc978d415714a3dd691fbc11',1,'lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate__byte_8cu.html#a53a2183d85282ab5726018767388efe8',1,'lru_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, int64_t time_stamp, Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, 
c10::optional< Tensor > uvm_cache_stats): lru_cache_populate_byte.cu']]], + ['lru_5fcache_5fpopulate_5fcuda_59',['lru_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga00d12767ad238d73598bf7dc4d1afa06',1,'lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< at::Tensor > lxu_cache_locking_counter): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate_8cu.html#ab841aec9d8660e547e492948a2ee9921',1,'lru_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, const int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, const int64_t time_stamp, Tensor lru_state, const bool stochastic_rounding, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< Tensor > lxu_cache_locking_counter): lru_cache_populate.cu']]], + ['lt_60',['lt',['../structfbgemm__gpu_1_1_comparator.html#aff9ffad7ca52493418c969769327b704',1,'fbgemm_gpu::Comparator']]], + ['lxu_5fcache_2ecpp_61',['lxu_cache.cpp',['../lxu__cache_8cpp.html',1,'']]], + ['lxu_5fcache_2ecu_62',['lxu_cache.cu',['../lxu__cache_8cu.html',1,'']]], + ['lxu_5fcache_5fflush_5fcuda_63',['lxu_cache_flush_cuda',['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool 
stochastic_rounding): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(Tensor uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor lxu_cache_state, Tensor lxu_cache_weights, bool stochastic_rounding): lxu_cache.cu']]], + ['lxu_5fcache_5flocations_64',['lxu_cache_locations',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a60a1ec59d36df78e844d5cd7a0d34f03',1,'lxu_cache_locations: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a3c502d4dedd432c8940a937269071ddc',1,'lxu_cache_locations: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a04f448d8b284fc09ac62abe6b241bfb0',1,'lxu_cache_locations: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a8204f76fc5db4c5c7ac336538fa9da1f',1,'lxu_cache_locations: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#ab8dd6cf2b56fe463818d54d1317d9fff',1,'lxu_cache_locations: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab8dd6cf2b56fe463818d54d1317d9fff',1,'lxu_cache_locations: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#ab8dd6cf2b56fe463818d54d1317d9fff',1,'lxu_cache_locations: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['lxu_5fcache_5flocations_5fupdate_5fcuda_65',['lxu_cache_locations_update_cuda',['../group__table-batched-embed-cuda.html#ga65cba33a439fb1ed50fe2e80dc22b603',1,'lxu_cache_locations_update_cuda(at::Tensor lxu_cache_locations, at::Tensor 
lxu_cache_locations_new, c10::optional< at::Tensor > num_uniq_cache_indices): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#ac602137fddc0c895b176d959fa3fa8db',1,'lxu_cache_locations_update_cuda(Tensor lxu_cache_locations, Tensor lxu_cache_locations_new, c10::optional< Tensor > num_uniq_cache_indices): lxu_cache.cu']]], + ['lxu_5fcache_5flocking_5fcounter_5fdecrement_5fcuda_66',['lxu_cache_locking_counter_decrement_cuda',['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu']]], + ['lxu_5fcache_5flookup_5fcpu_67',['lxu_cache_lookup_cpu',['../namespacefbgemm__gpu.html#ab26f1a83ce47d5510deed9bc9e9d6d9a',1,'fbgemm_gpu']]], + ['lxu_5fcache_5flookup_5fcuda_68',['lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#ga124b70b0fede88f508e59111ce6d765f',1,'lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, c10::optional< at::Tensor > num_uniq_cache_indices, c10::optional< at::Tensor > lxu_cache_locations_output): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#a083f4fd1219188cc40036595fa6921ab',1,'lxu_cache_lookup_cuda(const Tensor linear_cache_indices, const Tensor lxu_cache_state, const int64_t invalid_index, const bool gather_cache_stats, const c10::optional< Tensor > uvm_cache_stats, const c10::optional< Tensor > num_uniq_cache_indices, const c10::optional< Tensor > lxu_cache_locations_output): lxu_cache.cu']]], + 
['lxu_5fcache_5fparams_69',['LXU_CACHE_PARAMS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071d',1,'LXU_CACHE_PARAMS: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071d',1,'LXU_CACHE_PARAMS: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['lxu_5fcache_5fweights_70',['lxu_cache_weights',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a0c2527424502280dfcf6276b49b41cdc',1,'lxu_cache_weights: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a1d665aaf5a0d98bca13be6c158653005',1,'lxu_cache_weights: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a6a68ceee01fdfd5d0a31881988c095ae',1,'lxu_cache_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a2581c8ea9d11ed091efe32b3ec6d2920',1,'lxu_cache_weights: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a6a68ceee01fdfd5d0a31881988c095ae',1,'lxu_cache_weights: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6a68ceee01fdfd5d0a31881988c095ae',1,'lxu_cache_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a6a68ceee01fdfd5d0a31881988c095ae',1,'lxu_cache_weights: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#aac2986832e167da4c333ea92ea3deff2',1,'lxu_cache_weights: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + 
['lxu_5fparams_5fcnt_71',['LXU_PARAMS_CNT',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ae2a8bf21f0c677246d8d102686641b65',1,'LXU_PARAMS_CNT: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ae2a8bf21f0c677246d8d102686641b65',1,'LXU_PARAMS_CNT: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#ae2a8bf21f0c677246d8d102686641b65',1,'LXU_PARAMS_CNT: embedding_forward_split_kernel_v2_template.cu']]] ]; diff --git a/search/all_d.js b/search/all_d.js index 12d4e8ad5..f89e18a2b 100644 --- a/search/all_d.js +++ b/search/all_d.js @@ -1,4 +1,54 @@ var searchData= [ - ['permutation_20operators_0',['permutation operators',['../group__permute-pooled-embs-cpu.html',1,'CPU Permutation Operators'],['../group__permute-pooled-embs-gpu.html',1,'CUDA Permutation Operators']]] + ['main_0',['main',['../_c_make_c_compiler_id_8c.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): CMakeCXXCompilerId.cpp'],['../verify__fp16__stochastic__benchmark_8cu.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): verify_fp16_stochastic_benchmark.cu']]], + ['make_5fpacked_5ftensor_5faccessor32_1',['make_packed_tensor_accessor32',['../fbgemm__tensor__accessor_8h.html#ae5c092ed88e41832d415d06d837889b3',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpacked_5ftensor_5faccessor64_2',['make_packed_tensor_accessor64',['../fbgemm__tensor__accessor_8h.html#add453d9931017b7ca11b84095566ae26',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpacked_5ftensor_5faccessor_5facc_5ftype_5fbase_3',['MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE',['../fbgemm__tensor__accessor_8h.html#ae2a2547758e08761f973874a074b4fc1',1,'fbgemm_tensor_accessor.h']]], + 
['make_5fpacked_5ftensor_5faccessor_5fbase_4',['MAKE_PACKED_TENSOR_ACCESSOR_BASE',['../fbgemm__tensor__accessor_8h.html#ad5bf508fef6a8c9528a8f1c316bfd491',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpta_5facc_5fwith_5fname_5',['MAKE_PTA_ACC_WITH_NAME',['../fbgemm__tensor__accessor_8h.html#a23a5f2ae4f72b11bd67c678ae14d9af7',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpta_5fwith_5fname_6',['MAKE_PTA_WITH_NAME',['../fbgemm__tensor__accessor_8h.html#a614f4b016e2758186bd598bc3be6e6cf',1,'fbgemm_tensor_accessor.h']]], + ['make_5fzero_5ffloat2_7',['make_zero_float2',['../namespacefbgemm__gpu.html#a25e94d75c07b4c2bc5427fe771f2d60d',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat4_8',['make_zero_float4',['../namespacefbgemm__gpu.html#afca9b335bed360fc1ec3e239183a792f',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat8_9',['make_zero_float8',['../namespacefbgemm__gpu.html#a66822cc23f92dbb8c18c596511b2a917',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat_5f16_10',['make_zero_float_16',['../namespacefbgemm__gpu.html#a7dcc205dbf44fb2e80d62bf47eb6c4c4',1,'fbgemm_gpu']]], + ['managed_11',['MANAGED',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194af59a25f2594f469f0bfccad7f8f13744',1,'fbgemm_gpu']]], + ['managed_5fcaching_12',['MANAGED_CACHING',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194a3664f93edf39a3e7e0a84f3cefb624a6',1,'fbgemm_gpu']]], + ['mask_13',['mask',['../jagged__tensor__ops_2common_8cuh.html#a7d162c8b2172ea2cb7a10852acacc635',1,'common.cuh']]], + ['masked_5findex_5fput_5fbyte_5fcuda_14',['masked_index_put_byte_cuda',['../ssd__split__table__batched__embeddings_8cpp.html#ac6846069e59fcf7c6fad94b1321b0dd0',1,'ssd_split_table_batched_embeddings.cpp']]], + ['masked_5findex_5fput_5fcuda_15',['masked_index_put_cuda',['../ssd__split__embeddings__cache__cuda_8cu.html#a8a561f5585f09252076650c0d34457d7',1,'masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count): 
ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__table__batched__embeddings_8cpp.html#a8a561f5585f09252076650c0d34457d7',1,'masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count): ssd_split_embeddings_cache_cuda.cu']]], + ['masked_5fselect_5fjagged_5f1d_16',['masked_select_jagged_1d',['../namespacefbgemm__gpu.html#a0223abaee318471a5e42318a1b7056b6',1,'fbgemm_gpu']]], + ['max_17',['max',['../fbgemm__cuda__utils_8cuh.html#affe776513b24d84b39af8ab0930fef7f',1,'max: fbgemm_cuda_utils.cuh'],['../namespacefbgemm__gpu.html#a5f0a51933b0e3b1a96d8806d702ff82e',1,'fbgemm_gpu::max()']]], + ['max_5fb_18',['MAX_B',['../split__embeddings__utils_8cuh.html#a8fe8da855c3ca31f1825ef6779aa2458',1,'split_embeddings_utils.cuh']]], + ['max_5fd_19',['max_D',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a4b79f2e1c1afb0ee9291f6d406038bd7',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['max_5fd_5fcache_20',['max_D_cache',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a179f256aa33ee3f02b437129f3186a4c',1,'max_D_cache: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5c99f8e3e9c924534ce3075312e4b34a',1,'max_D_cache: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a179f256aa33ee3f02b437129f3186a4c',1,'max_D_cache: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a5c99f8e3e9c924534ce3075312e4b34a',1,'max_D_cache: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5c99f8e3e9c924534ce3075312e4b34a',1,'max_D_cache: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a5c99f8e3e9c924534ce3075312e4b34a',1,'max_D_cache: 
gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['max_5felements_5fper_5fthread_21',['MAX_ELEMENTS_PER_THREAD',['../namespacefbgemm__gpu.html#af75fe947c4a976895a9fb2c7501439b1',1,'fbgemm_gpu']]], + ['max_5fentries_5fper_5fblock_22',['MAX_ENTRIES_PER_BLOCK',['../metric__ops_8cu.html#a9c50cd9bc9f8be967692db87d6fdf57f',1,'metric_ops.cu']]], + ['max_5fnorm_23',['max_norm',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a5dad34a0e8f59dfa6c15365b5f987ba6',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['max_5ft_24',['MAX_T',['../split__embeddings__utils_8cuh.html#a83944439cec525d70fcf8281a639760d',1,'split_embeddings_utils.cuh']]], + ['mean_25',['MEAN',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5da4ea6d1161ea24d7599365f574aff6610',1,'fbgemm_gpu']]], + ['mean_5fpooling_26',['mean_pooling',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#aa7749446d7c1da86adc5b7c06dcc7817',1,'mean_pooling: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a3fdf1a4014f7660a86139d200368f74f',1,'mean_pooling: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#aa7749446d7c1da86adc5b7c06dcc7817',1,'mean_pooling: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a3fdf1a4014f7660a86139d200368f74f',1,'mean_pooling: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a3fdf1a4014f7660a86139d200368f74f',1,'mean_pooling: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a3fdf1a4014f7660a86139d200368f74f',1,'mean_pooling: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['memory_20operators_27',['CUDA Memory 
Operators',['../group__cumem-utils.html',1,'']]], + ['memory_5futils_2ecpp_28',['memory_utils.cpp',['../memory__utils_8cpp.html',1,'']]], + ['memory_5futils_2ecu_29',['memory_utils.cu',['../memory__utils_8cu.html',1,'']]], + ['memory_5futils_5fops_2ecpp_30',['memory_utils_ops.cpp',['../memory__utils__ops_8cpp.html',1,'']]], + ['memory_5futils_5fops_2ecu_31',['memory_utils_ops.cu',['../memory__utils__ops_8cu.html',1,'']]], + ['memory_5futils_5fops_5fcpu_2ecpp_32',['memory_utils_ops_cpu.cpp',['../memory__utils__ops__cpu_8cpp.html',1,'']]], + ['merge_20operators_33',['Merge Operators',['../group__merge-pooled-emb.html',1,'']]], + ['merge_5fpooled_5fembedding_5fops_5fcpu_2ecpp_34',['merge_pooled_embedding_ops_cpu.cpp',['../merge__pooled__embedding__ops__cpu_8cpp.html',1,'']]], + ['merge_5fpooled_5fembedding_5fops_5fgpu_2ecpp_35',['merge_pooled_embedding_ops_gpu.cpp',['../merge__pooled__embedding__ops__gpu_8cpp.html',1,'']]], + ['merge_5fpooled_5fembeddings_36',['merge_pooled_embeddings',['../namespacefbgemm__gpu.html#a25ca3ce57c9101b878431d46cc049b50',1,'fbgemm_gpu']]], + ['merge_5fpooled_5fembeddings_2eh_37',['merge_pooled_embeddings.h',['../merge__pooled__embeddings_8h.html',1,'']]], + ['merge_5fpooled_5fembeddings_5fcpu_38',['merge_pooled_embeddings_cpu',['../namespacefbgemm__gpu.html#aad2aea0289bc3c5d135846ee32e0638c',1,'fbgemm_gpu']]], + ['metric_5fops_2ecu_39',['metric_ops.cu',['../metric__ops_8cu.html',1,'']]], + ['metric_5fops_2eh_40',['metric_ops.h',['../metric__ops_8h.html',1,'']]], + ['metric_5fops_5fhost_2ecpp_41',['metric_ops_host.cpp',['../metric__ops__host_8cpp.html',1,'']]], + ['min_42',['min',['../fbgemm__cuda__utils_8cuh.html#ac6afabdc09a49a433ee19d8a9486056d',1,'min: fbgemm_cuda_utils.cuh'],['../namespacefbgemm__gpu.html#a5b62c5028106dcf10b450a8f178338ad',1,'fbgemm_gpu::min()']]], + ['mod_43',['Mod',['../classfbgemm__gpu_1_1_fixed_divisor.html#a604d46db75c43e0cd210e5b2ab2bc7e6',1,'fbgemm_gpu::FixedDivisor']]], + 
['momentum1_5fdev_44',['momentum1_dev',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#af9476d367260e52c6a3bd31824072c06',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['momentum1_5foffsets_45',['momentum1_offsets',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#afd2978ce7ef7477233a8bda0aacde4e2',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['momentum1_5fplacements_46',['momentum1_placements',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a10602f96a8b9264528834b6a1763ffb1',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['momentum1_5fuvm_47',['momentum1_uvm',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a3f5ed4cb8fcb526d7476413516fd546f',1,'gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu']]], + ['mul_48',['mul',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::mul()']]], + 
['mul_5f_49',['mul_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< float >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< at::Half >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< double >::mul_()']]], + ['my_5fsize_50',['my_size',['../namespacefbgemm__gpu.html#a726d1405842124631d2e9543e6abfd70',1,'fbgemm_gpu']]] ]; diff --git a/search/all_e.js b/search/all_e.js index 7c6f6fc5f..4af6bb04a 100644 --- a/search/all_e.js +++ b/search/all_e.js @@ -1,5 +1,45 @@ var searchData= [ - ['quantization_20operators_20for_20cuda_0',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]], - ['quantize_20data_20cpu_20operators_1',['Quantize Data CPU Operators',['../group__quantize-data-cpu.html',1,'']]] + ['n_0',['n',['../namespacefbgemm__gpu.html#a4e34aefb3cc5403a07c020131077100a',1,'fbgemm_gpu']]], + ['name_5f_1',['name_',['../classfbgemm__gpu_1_1enum__registration.html#a7dac8366c11fbcad2f49d85fe8fc4fbe',1,'fbgemm_gpu::enum_registration']]], + ['native_5fempty_5flike_2',['native_empty_like',['../namespacefbgemm__gpu.html#a2f18d44e708cafd185e02defd95fb774',1,'fbgemm_gpu']]], + ['nbit_3',['nbit',['../namespacenbit.html',1,'']]], + ['ndim_4',['ndim',['../struct_stack_array.html#a7cff664dfb347e3967c24b7c4ebe0fa9',1,'StackArray']]], + ['nearest_5frounding_5fvector_5',['nearest_rounding_vector',['../namespacefbgemm__gpu.html#a94744dd15c8d4ffa9c5cf581e499f1ca',1,'fbgemm_gpu::nearest_rounding_vector(dst_t *output, const Vec4T< src_t > &value, const float2)'],['../namespacefbgemm__gpu.html#aa56064f3d743f7535d59a1baca06dc1f',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< float > 
&value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aefcbaad4af03b4a72b15ca0ca40bc50f',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< at::Half > &value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aa8fa436e2338f97218eff8a48c94d8a4',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< double > &value, const float2 qparams)']]], + ['new_5fhost_5fmapped_5ftensor_6',['new_host_mapped_tensor',['../group__cumem-utils.html#ga5663643a8ac5de83063d0ff51bb9af17',1,'fbgemm_gpu']]], + ['new_5findices_5fdata_7',['new_indices_data',['../namespacefbgemm__gpu.html#a12ee89697c142bf6626fc9773b3784ce',1,'fbgemm_gpu']]], + ['new_5flengths_5fdata_8',['new_lengths_data',['../namespacefbgemm__gpu.html#a2590465d158f637aa65cb705ceff155d',1,'fbgemm_gpu']]], + ['new_5fmanaged_5ftensor_9',['new_managed_tensor',['../group__cumem-utils.html#gab708b23762a11187eb6a32a36f0e34a3',1,'fbgemm_gpu']]], + ['new_5fmanaged_5ftensor_5fmeta_10',['new_managed_tensor_meta',['../group__cumem-utils.html#ga5351c6ec3de203476cf09df330455d91',1,'fbgemm_gpu']]], + ['new_5foffsets_5fdata_11',['new_offsets_data',['../namespacefbgemm__gpu.html#ab36576a24b49bfce1e9b6ff66a37ebe1',1,'fbgemm_gpu']]], + ['new_5fpos_5fdata_12',['new_pos_data',['../namespacefbgemm__gpu.html#a1543754093f5b3f003f28b6120d4508f',1,'fbgemm_gpu']]], + ['new_5funified_5ftensor_13',['new_unified_tensor',['../group__cumem-utils.html#ga6f8847537ea9ed13fc7e2e378bc79b1f',1,'fbgemm_gpu']]], + ['new_5funified_5ftensor_5fcpu_14',['new_unified_tensor_cpu',['../namespacefbgemm__gpu.html#aad6847fe2dc2433889aeb2dddf14f496',1,'fbgemm_gpu']]], + ['new_5fvanilla_5fmanaged_5ftensor_15',['new_vanilla_managed_tensor',['../group__cumem-utils.html#gad5e0d2307667c3db5e73f0c0eec15df5',1,'fbgemm_gpu']]], + ['new_5fweights_5fdata_16',['new_weights_data',['../namespacefbgemm__gpu.html#a2aaf9a58df0549a13d01ab53cd60ddff',1,'fbgemm_gpu']]], + 
['next_5f_17',['next_',['../classfbgemm__gpu_1_1enum__registration.html#a8797d90a1e9ec2163cb8192a962d06dd',1,'fbgemm_gpu::enum_registration']]], + ['next_5foffset_18',['next_offset',['../namespacefbgemm__gpu.html#ac588c52c993fa6f169cb54d418ea584c',1,'fbgemm_gpu']]], + ['node_19',['Node',['../topology__utils_8h.html#a659b93920c81116289ee7ff5d45f48c9',1,'topology_utils.h']]], + ['none_20',['NONE',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5dab50339a10e1de285ac99d4c3990b8693',1,'fbgemm_gpu']]], + ['num_5fargs_21',['NUM_ARGS',['../namespacefbgemm__gpu.html#a711d3a0cadc94f73da860c1ffd01e1b2',1,'fbgemm_gpu']]], + ['num_5fbins_22',['num_bins',['../namespacefbgemm__gpu.html#aff8ee4d321b4a815868fe53b25b8fe6b',1,'fbgemm_gpu']]], + ['num_5fcalls_23',['num_calls',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaadaf139c74384603431fd1bbb3347aa34',1,'fbgemm_gpu']]], + ['num_5fcols_5fgroup_24',['num_cols_group',['../namespacefbgemm__gpu.html#a2f734f38c3537666ae53e906e65c1a6e',1,'fbgemm_gpu']]], + ['num_5fconflict_5fmisses_25',['num_conflict_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaac0cd9dffdb3c001656bee52db850d1c6',1,'fbgemm_gpu']]], + ['num_5fconflict_5funique_5fmisses_26',['num_conflict_unique_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaa30ee3b3c17bbfefe571f4ea5e99b00d6',1,'fbgemm_gpu']]], + ['num_5flists_27',['num_lists',['../namespacefbgemm__gpu.html#a447e5ea8dc79992a05131d8803d2bf7e',1,'fbgemm_gpu']]], + ['num_5fnon_5fzero_5fcolumns_28',['num_non_zero_columns',['../structinternal_1_1_hyper_compressed_sparse_column.html#a601991d88e6582d3bdb8bba778842c25',1,'internal::HyperCompressedSparseColumn']]], + ['num_5foffsets_5fper_5fwarp_29',['NUM_OFFSETS_PER_WARP',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a507523ed27f39808542bbb3b9c1382af',1,'NUM_OFFSETS_PER_WARP: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a507523ed27f39808542bbb3b9c1382af',1,'NUM_OFFSETS_PER_WARP: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['num_5fparams_30',['NUM_PARAMS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a586264efd35f61c1e5b73ab1fd4f87a5',1,'NUM_PARAMS: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a586264efd35f61c1e5b73ab1fd4f87a5',1,'NUM_PARAMS: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['num_5frequested_5findices_31',['num_requested_indices',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaacf3fcf7ace9b3a5b4ab424c874b84439',1,'fbgemm_gpu']]], + ['num_5fsegments_32',['num_segments',['../namespacefbgemm__gpu.html#a13adcdfa105d3fe5d68bfeae4df5f017',1,'fbgemm_gpu']]], + ['num_5fthreads_5fper_5fblock_33',['NUM_THREADS_PER_BLOCK',['../metric__ops_8cu.html#ac147221d5b74086a08d3623657d16517',1,'metric_ops.cu']]], + ['num_5funique_5findices_34',['num_unique_indices',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaaa555e0f1fe32e24cc25b049fdf3d0afc',1,'fbgemm_gpu']]], + ['num_5funique_5fmisses_35',['num_unique_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaaabea3db589a421890b799e0ac63dfc53',1,'fbgemm_gpu']]], + ['num_5fwarps_36',['NUM_WARPS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aef84dc9fc9b8afa43b8fed4684630167',1,'NUM_WARPS: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aef84dc9fc9b8afa43b8fed4684630167',1,'NUM_WARPS: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['num_5fwarps_5ffor_5fsmall_5fl_37',['num_warps_for_small_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1df1c715b5de4bbc9d9d9a5d78122a92',1,'num_warps_for_small_L: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1df1c715b5de4bbc9d9d9a5d78122a92',1,'num_warps_for_small_L: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['num_5fwarps_5fper_5frow_38',['num_warps_per_row',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af324e8b39fc546b4a54e9436513e33b9',1,'num_warps_per_row: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#af324e8b39fc546b4a54e9436513e33b9',1,'num_warps_per_row: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['num_5fwork_5frows_39',['num_work_rows',['../namespacefbgemm__gpu.html#aeb2ce03cab381b1393d4c7c355ef2286',1,'fbgemm_gpu']]], + ['numel_5f_40',['numel_',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#af0cbc4b5fa9aa44d9a44f4df77fa5c2d',1,'fbgemm_gpu::TensorAccessorBase::numel_'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#af0cbc4b5fa9aa44d9a44f4df77fa5c2d',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::numel_']]], + ['nvml_5fcheck_41',['NVML_CHECK',['../topology__utils_8cpp.html#af1ec00426a14a4658189ab308ea76636',1,'topology_utils.cpp']]] ]; diff --git a/search/all_f.js b/search/all_f.js index 202af0400..46737d9e0 100644 --- a/search/all_f.js +++ b/search/all_f.js @@ -1,5 +1,19 @@ var searchData= [ - ['sparse_20data_20cpu_20operators_0',['Sparse Data CPU Operators',['../group__sparse-data-cpu.html',1,'']]], - ['sparse_20data_20cuda_20operators_1',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]] + ['offset_5ftbe_5finput_5fcombine_5fwith_5flength_5fargs_0',['offset_tbe_input_combine_with_length_args',['../namespacefbgemm__gpu.html#ab6871043c7881b5434de1e8eea491c80',1,'fbgemm_gpu']]], + ['offsets_1',['offsets',['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#aff2584a62b3409906c19c5419a4cc647',1,'offsets: 
gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#afc0762ff936d64a73eef3c78b9585024',1,'offsets: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#aab1af2e6ba28faa781f71e91f2347d43',1,'offsets: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a6ed0a81091088c3c07a10b7fd8e63358',1,'offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#afc0762ff936d64a73eef3c78b9585024',1,'offsets: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a7f8b83bcbf1f5b73f650fb246a02a2fe',1,'offsets: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7f8b83bcbf1f5b73f650fb246a02a2fe',1,'offsets: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a7f8b83bcbf1f5b73f650fb246a02a2fe',1,'offsets: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../namespacefbgemm__gpu.html#aad33dfd216d9ea27b505a304ca3e32da',1,'fbgemm_gpu::offsets']]], + ['offsets_5fdata_2',['offsets_data',['../namespacefbgemm__gpu.html#a66f41f5ea495c26af7e2007fe0a28edc',1,'fbgemm_gpu']]], + ['offsets_5frange_5fcpu_3',['offsets_range_cpu',['../namespacefbgemm__gpu.html#a5aff23a0a3b0bc872ba44a0045b6e350',1,'fbgemm_gpu']]], + ['offsets_5frange_5fcuda_4',['offsets_range_cuda',['../namespacefbgemm__gpu.html#a3d88da2f7a769565c9ebdc070467eabe',1,'fbgemm_gpu']]], + ['operator_5b_5d_5',['operator[]',['../classfbgemm__gpu_1_1_tensor_accessor.html#a72a3b6251f6388b00f3edcd8d3311600',1,'fbgemm_gpu::TensorAccessor::operator[](index_t 
i)'],['../classfbgemm__gpu_1_1_tensor_accessor.html#a16735630a1b17005797473122c151321',1,'fbgemm_gpu::TensorAccessor::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a00a4aa208155f5c8a633eddc32351081',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i)'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a3b81b97c0e920adcd47b7f6a5b0af0cf',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#ab6e8f8fe313b1de35e94636bdd4e34dd',1,'fbgemm_gpu::GenericPackedTensorAccessor::operator[](index_t i)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#a6933e03eff2b2428f9eb67e597a520c1',1,'fbgemm_gpu::GenericPackedTensorAccessor::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a3593eea2d954fec0db1139e509206816',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a07dee357cdcdf158224410aaf987e7d3',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i) const']]], + ['operators_6',['Operators',['../group__input-combine.html',1,'Combine Input Operators'],['../group__cumem-utils.html',1,'CUDA Memory Operators'],['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__jagged-tensor-ops-cpu.html',1,'Jagged Tensor Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU 
Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__merge-pooled-emb.html',1,'Merge Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], + ['operators_20cpu_7',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['operators_20cuda_8',['Operators CUDA',['../group__permute-pooled-embs-gpu.html',1,'Permute Pooled Embeddings Operators (CUDA)'],['../group__quantize-ops-cuda.html',1,'Quantization Operators (CUDA)']]], + ['ops_5futils_2eh_9',['ops_utils.h',['../ops__utils_8h.html',1,'']]], + ['output_10',['output',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a110a71f81fecd3888738618492db1672',1,'output: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#af6e6ad15bb4078d9c64b33a85e9618ec',1,'output: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a8cb737489e5e5b8dc4db6de0b9c96a6f',1,'output: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a511b9e94b01de29a6671f16533eaf6dd',1,'output: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa742bdb164d113128d3e9b155f95acfe',1,'output: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a8cb737489e5e5b8dc4db6de0b9c96a6f',1,'output: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a85cde3a0577b44c06afc80d802b86dc6',1,'output: 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a85cde3a0577b44c06afc80d802b86dc6',1,'output: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a85cde3a0577b44c06afc80d802b86dc6',1,'output: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../namespacefbgemm__gpu.html#ae662e9187ce6364e1668803dfbf7e7d0',1,'fbgemm_gpu::output']]], + ['output_5fdata_11',['output_data',['../namespacefbgemm__gpu.html#a783fcd132908afcc711d1a7fb2cb51a7',1,'fbgemm_gpu']]], + ['output_5foffsets_12',['output_offsets',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a48df3803c4f164ff153d5348c6f8f10f',1,'output_offsets: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../namespacefbgemm__gpu.html#a72822c0cc98165904fdc0110344ecdd5',1,'fbgemm_gpu::output_offsets']]], + ['output_5fpermute_13',['output_permute',['../namespacefbgemm__gpu.html#aa80e8b11fe8b3b1a619f329aeb089f54',1,'fbgemm_gpu']]], + ['output_5fptrs_14',['output_ptrs',['../namespacefbgemm__gpu.html#a038ee34932113e6d3d38345920211f4c',1,'fbgemm_gpu']]], + ['output_5fvec_5ft_15',['output_vec_t',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a46b036c211c222352709e6bb2420878d',1,'output_vec_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a46b036c211c222352709e6bb2420878d',1,'output_vec_t: gen_embedding_forward_split_weighted_v2_kernel.cu']]] ]; diff --git a/search/classes_0.js b/search/classes_0.js index bc3b5cb3a..1a304ee85 100644 --- a/search/classes_0.js +++ b/search/classes_0.js @@ -1,4 +1,4 @@ var searchData= [ - ['comparator_0',['Comparator',['../structfbgemm__gpu_1_1_comparator.html',1,'fbgemm_gpu']]] + ['bitonicsort_0',['BitonicSort',['../structfbgemm__gpu_1_1_bitonic_sort.html',1,'fbgemm_gpu']]] ]; diff --git a/search/classes_1.js 
b/search/classes_1.js new file mode 100644 index 000000000..bc3b5cb3a --- /dev/null +++ b/search/classes_1.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['comparator_0',['Comparator',['../structfbgemm__gpu_1_1_comparator.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_2.js b/search/classes_2.js new file mode 100644 index 000000000..2ccbab8bb --- /dev/null +++ b/search/classes_2.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['defaultptrtraits_0',['DefaultPtrTraits',['../structfbgemm__gpu_1_1_default_ptr_traits.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_3.js b/search/classes_3.js new file mode 100644 index 000000000..cddc09ee6 --- /dev/null +++ b/search/classes_3.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['embeddingrocksdb_0',['EmbeddingRocksDB',['../classssd_1_1_embedding_rocks_d_b.html',1,'ssd']]], + ['enum_5fregistration_1',['enum_registration',['../classfbgemm__gpu_1_1enum__registration.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_4.js b/search/classes_4.js new file mode 100644 index 000000000..11983cd4d --- /dev/null +++ b/search/classes_4.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['fixeddivisor_0',['FixedDivisor',['../classfbgemm__gpu_1_1_fixed_divisor.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_5.js b/search/classes_5.js new file mode 100644 index 000000000..91451dbcb --- /dev/null +++ b/search/classes_5.js @@ -0,0 +1,8 @@ +var searchData= +[ + ['genericpackedtensoraccessor_0',['GenericPackedTensorAccessor',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessor_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_1',['GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessorbase_2',['GenericPackedTensorAccessorBase',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'fbgemm_gpu']]], + 
['genericpackedtensoraccessorbase_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_3',['GenericPackedTensorAccessorBase< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessorbase_3c_20t_2c_20n_2c_20defaultptrtraits_2c_20int64_5ft_20_3e_4',['GenericPackedTensorAccessorBase< T, N, DefaultPtrTraits, int64_t >',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_6.js b/search/classes_6.js new file mode 100644 index 000000000..984b6ce92 --- /dev/null +++ b/search/classes_6.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['half4_0',['Half4',['../structfbgemm__gpu_1_1_half4.html',1,'fbgemm_gpu']]], + ['hypercompressedsparsecolumn_1',['HyperCompressedSparseColumn',['../structinternal_1_1_hyper_compressed_sparse_column.html',1,'internal']]] +]; diff --git a/search/classes_7.js b/search/classes_7.js new file mode 100644 index 000000000..2e4a34b1d --- /dev/null +++ b/search/classes_7.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['initializer_0',['Initializer',['../classssd_1_1_initializer.html',1,'ssd']]] +]; diff --git a/search/classes_8.js b/search/classes_8.js new file mode 100644 index 000000000..f0f450ab1 --- /dev/null +++ b/search/classes_8.js @@ -0,0 +1,6 @@ +var searchData= +[ + ['log2_5fcalc_0',['log2_calc',['../structlog2__calc.html',1,'']]], + ['log2_5fcalc_5f_1',['log2_calc_',['../structlog2__calc__.html',1,'']]], + ['log2_5fcalc_5f_3c_200_20_3e_2',['log2_calc_< 0 >',['../structlog2__calc___3_010_01_4.html',1,'']]] +]; diff --git a/search/classes_9.js b/search/classes_9.js new file mode 100644 index 000000000..a47d663bc --- /dev/null +++ b/search/classes_9.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['permutepooledembsfunction_0',['PermutePooledEmbsFunction',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html',1,'fbgemm_gpu']]], + 
['permutepooledembsfunctionsplit_1',['PermutePooledEmbsFunctionSplit',['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_a.js b/search/classes_a.js new file mode 100644 index 000000000..d863ea26d --- /dev/null +++ b/search/classes_a.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['rk_5fstate_0',['rk_state',['../structfbgemm__gpu_1_1rk__state.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_b.js b/search/classes_b.js new file mode 100644 index 000000000..d53e0111c --- /dev/null +++ b/search/classes_b.js @@ -0,0 +1,12 @@ +var searchData= +[ + ['sharedmemory_0',['SharedMemory',['../structfbgemm__gpu_1_1_shared_memory.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20double_20_3e_1',['SharedMemory< double >',['../structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20float_20_3e_2',['SharedMemory< float >',['../structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20int32_5ft_20_3e_3',['SharedMemory< int32_t >',['../structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20int64_5ft_20_3e_4',['SharedMemory< int64_t >',['../structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20vec4t_3c_20at_3a_3aacc_5ftype_3c_20double_2c_20true_20_3e_20_3e_20_3e_5',['SharedMemory< Vec4T< at::acc_type< double, true > > >',['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html',1,'fbgemm_gpu']]], + ['sharedmemory_3c_20vec4t_3c_20at_3a_3aacc_5ftype_3c_20float_2c_20true_20_3e_20_3e_20_3e_6',['SharedMemory< Vec4T< at::acc_type< float, true > > >',['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html',1,'fbgemm_gpu']]], + ['stackarray_7',['StackArray',['../struct_stack_array.html',1,'']]], + 
['stochasticroundingrngstate_8',['StochasticRoundingRNGState',['../structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_c.js b/search/classes_c.js new file mode 100644 index 000000000..c54250d05 --- /dev/null +++ b/search/classes_c.js @@ -0,0 +1,8 @@ +var searchData= +[ + ['tensoraccessor_0',['TensorAccessor',['../classfbgemm__gpu_1_1_tensor_accessor.html',1,'fbgemm_gpu']]], + ['tensoraccessor_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_1',['TensorAccessor< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html',1,'fbgemm_gpu']]], + ['tensoraccessorbase_2',['TensorAccessorBase',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['tensoraccessorbase_3c_20t_2c_201_2c_20ptrtraits_2c_20index_5ft_20_3e_3',['TensorAccessorBase< T, 1, PtrTraits, index_t >',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'fbgemm_gpu']]], + ['tensoraccessorbase_3c_20t_2c_20n_2c_20defaultptrtraits_2c_20int64_5ft_20_3e_4',['TensorAccessorBase< T, N, DefaultPtrTraits, int64_t >',['../classfbgemm__gpu_1_1_tensor_accessor_base.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_d.js b/search/classes_d.js new file mode 100644 index 000000000..74ebb7a20 --- /dev/null +++ b/search/classes_d.js @@ -0,0 +1,24 @@ +var searchData= +[ + ['vec4acct_0',['Vec4AccT',['../structfbgemm__gpu_1_1_vec4_acc_t.html',1,'fbgemm_gpu']]], + ['vec4stept_1',['Vec4StepT',['../structfbgemm__gpu_1_1_vec4_step_t.html',1,'fbgemm_gpu']]], + ['vec4stept_3c_20step_2c_20at_3a_3ahalf_20_3e_2',['Vec4StepT< STEP, at::Half >',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html',1,'fbgemm_gpu']]], + ['vec4stept_3c_20step_2c_20float_20_3e_3',['Vec4StepT< STEP, float >',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html',1,'fbgemm_gpu']]], + ['vec4stept_3c_20step_2c_20uint8_5ft_20_3e_4',['Vec4StepT< STEP, uint8_t 
>',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_5',['Vec4T',['../structfbgemm__gpu_1_1_vec4_t.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20at_3a_3abfloat16_20_3e_6',['Vec4T< at::BFloat16 >',['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20at_3a_3ahalf_20_3e_7',['Vec4T< at::Half >',['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20double_20_3e_8',['Vec4T< double >',['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html',1,'fbgemm_gpu']]], + ['vec4t_3c_20float_20_3e_9',['Vec4T< float >',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html',1,'fbgemm_gpu']]], + ['vec4type_10',['Vec4Type',['../struct_vec4_type.html',1,'']]], + ['vec4type_3c_20at_3a_3ahalf_20_3e_11',['Vec4Type< at::Half >',['../struct_vec4_type_3_01at_1_1_half_01_4.html',1,'']]], + ['vec4type_3c_20float_20_3e_12',['Vec4Type< float >',['../struct_vec4_type_3_01float_01_4.html',1,'']]], + ['vec4type_3c_20uint8_5ft_20_3e_13',['Vec4Type< uint8_t >',['../struct_vec4_type_3_01uint8__t_01_4.html',1,'']]], + ['vecnt_14',['VecNT',['../structfbgemm__gpu_1_1_vec_n_t.html',1,'fbgemm_gpu']]], + ['vecnt_3c_201_2c_20primitivetype_3a_3afp_20_3e_15',['VecNT< 1, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_2016_2c_20primitivetype_3a_3aint_20_3e_16',['VecNT< 16, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_202_2c_20primitivetype_3a_3afp_20_3e_17',['VecNT< 2, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_204_2c_20primitivetype_3a_3afp_20_3e_18',['VecNT< 4, PrimitiveType::FP >',['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html',1,'fbgemm_gpu']]], + 
['vecnt_3c_204_2c_20primitivetype_3a_3aint_20_3e_19',['VecNT< 4, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]], + ['vecnt_3c_208_2c_20primitivetype_3a_3aint_20_3e_20',['VecNT< 8, PrimitiveType::INT >',['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html',1,'fbgemm_gpu']]] +]; diff --git a/search/classes_e.js b/search/classes_e.js new file mode 100644 index 000000000..06bb6ac7f --- /dev/null +++ b/search/classes_e.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['weightrow_0',['WeightRow',['../structfbgemm__gpu_1_1_weight_row.html',1,'fbgemm_gpu']]] +]; diff --git a/search/defines_0.js b/search/defines_0.js new file mode 100644 index 000000000..e7fdd53bd --- /dev/null +++ b/search/defines_0.js @@ -0,0 +1,6 @@ +var searchData= +[ + ['_5f_5fhalf2_5fto_5fui_0',['__HALF2_TO_UI',['../fbgemm__cuda__utils_8cuh.html#ab78d230e0bbda883a8f34ca1e31d0929',1,'fbgemm_cuda_utils.cuh']]], + ['_5f_5fhas_5finclude_1',['__has_include',['../_c_make_c_compiler_id_8c.html#ae5510d82e4946f1656f4969911c54736',1,'__has_include: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#ae5510d82e4946f1656f4969911c54736',1,'__has_include: CMakeCXXCompilerId.cpp']]], + ['_5fdispatch_5femb_5fcache_5ftypes_2',['_DISPATCH_EMB_CACHE_TYPES',['../dispatch__macros_8h.html#a8a3aad8de22734b1397d813a855528e1',1,'dispatch_macros.h']]] +]; diff --git a/search/defines_1.js b/search/defines_1.js new file mode 100644 index 000000000..bad99df52 --- /dev/null +++ b/search/defines_1.js @@ -0,0 +1,6 @@ +var searchData= +[ + ['acc_5fadd_5for_5ffma_0',['ACC_ADD_OR_FMA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#ada15471a8b1da6a3a43b940916fea71e',1,'ACC_ADD_OR_FMA: embedding_forward_split_kernel_v2_template.cu']]], + ['architecture_5fid_1',['ARCHITECTURE_ID',['../_c_make_c_compiler_id_8c.html#aba35d0d200deaeb06aee95ca297acb28',1,'ARCHITECTURE_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#aba35d0d200deaeb06aee95ca297acb28',1,'ARCHITECTURE_ID: CMakeCXXCompilerId.cpp']]], + ['at_5fx_2',['AT_X',['../fbgemm__tensor__accessor_8h.html#ac7d28de6473a715c6228c08b391476bb',1,'fbgemm_tensor_accessor.h']]] +]; diff --git a/search/defines_10.js b/search/defines_10.js new file mode 100644 index 000000000..09f4f4599 --- /dev/null +++ b/search/defines_10.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['x_0',['X',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: 
gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: 
gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#ae6cc33dae61d3333c3d2e6be5f9cf16e',1,'X: embedding_forward_quantized_split_nbit_host_template.cu']]] +]; diff --git a/search/defines_11.js 
b/search/defines_11.js new file mode 100644 index 000000000..ae5adc93e --- /dev/null +++ b/search/defines_11.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['y_0',['Y',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__split__nbit__host__template_8cu.html#acec51faeb0681c58de451cb9d59abe95',1,'Y: embedding_forward_quantized_split_nbit_host_template.cu']]] +]; diff --git a/search/defines_2.js b/search/defines_2.js new file mode 100644 index 000000000..b25e0482a --- /dev/null +++ b/search/defines_2.js @@ -0,0 +1,7 @@ +var searchData= +[ + ['c_5fversion_0',['C_VERSION',['../_c_make_c_compiler_id_8c.html#adaee3ee7c5a7a22451ea25e762e1d7d5',1,'CMakeCCompilerId.c']]], + ['compiler_5fid_1',['COMPILER_ID',['../_c_make_c_compiler_id_8c.html#a81dee0709ded976b2e0319239f72d174',1,'COMPILER_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a81dee0709ded976b2e0319239f72d174',1,'COMPILER_ID: CMakeCXXCompilerId.cpp']]], + ['cuda_5fcheck_2',['CUDA_CHECK',['../cuda__utils_8cuh.html#ad64d49299c3d240ae540a693ae38ca38',1,'cuda_utils.cuh']]], + ['cxx_5fstd_3',['CXX_STD',['../_c_make_c_x_x_compiler_id_8cpp.html#a34cc889e576a1ae6c84ae9e0a851ba21',1,'CMakeCXXCompilerId.cpp']]] +]; diff --git a/search/defines_3.js b/search/defines_3.js new file mode 100644 index 000000000..ba7645e60 --- /dev/null +++ b/search/defines_3.js @@ -0,0 +1,25 @@ +var searchData= +[ + 
['dec_0',['DEC',['../_c_make_c_compiler_id_8c.html#ad1280362da42492bbc11aa78cbf776ad',1,'DEC: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#ad1280362da42492bbc11aa78cbf776ad',1,'DEC: CMakeCXXCompilerId.cpp']]], + ['decl_5fradix_5fsort_5fpairs_5ffn_1',['DECL_RADIX_SORT_PAIRS_FN',['../split__embeddings__utils_8cuh.html#a91fe9e10ff5c98fe4952c9c0986476b4',1,'split_embeddings_utils.cuh']]], + ['def_5fradix_5fsort_5fpairs_5ffn_2',['DEF_RADIX_SORT_PAIRS_FN',['../radix__sort__pairs_8cu.html#a4cf2c787c9111fdc77b98fcc9e690344',1,'radix_sort_pairs.cu']]], + ['device_5finline_3',['DEVICE_INLINE',['../fbgemm__cuda__utils_8cuh.html#a8888b6e919f4a14975d3110a7425407d',1,'fbgemm_cuda_utils.cuh']]], + ['dispatch_5fdense_5fto_5fjagged_5fcase_4',['DISPATCH_DENSE_TO_JAGGED_CASE',['../dense__to__jagged__forward_8cu.html#ab94a3e4679ece26e229ec76dc9733ca2',1,'dense_to_jagged_forward.cu']]], + ['dispatch_5femb_5fcache_5foutput_5ftypes_5',['DISPATCH_EMB_CACHE_OUTPUT_TYPES',['../dispatch__macros_8h.html#a8f06a63f75524d1985d76648b0fcf990',1,'dispatch_macros.h']]], + ['dispatch_5femb_5fcache_5ftypes_6',['DISPATCH_EMB_CACHE_TYPES',['../dispatch__macros_8h.html#ac4599e1c46b6eb357145dd791c6ae5c9',1,'dispatch_macros.h']]], + ['dispatch_5femb_5fgrad_5fcache_5ftypes_7',['DISPATCH_EMB_GRAD_CACHE_TYPES',['../dispatch__macros_8h.html#a10b99a9b7edecc89f4558ba0cf37c0ee',1,'dispatch_macros.h']]], + ['dispatch_5fkernel_5ffor_5fcache_5fcase_8',['DISPATCH_KERNEL_FOR_CACHE_CASE',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: 
gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#a285553bb10df1164c041a1cb931b44a8',1,'DISPATCH_KERNEL_FOR_CACHE_CASE: embedding_forward_split_template.cu']]], + ['dispatch_5foptimal_5fforward_5fkernel_9',['DISPATCH_OPTIMAL_FORWARD_KERNEL',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: 
gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#abe51720e514c6a9d39c95bc2c72e1cd6',1,'DISPATCH_OPTIMAL_FORWARD_KERNEL: embedding_forward_split_template.cu']]], + ['dispatch_5foptimal_5fkernel_10',['DISPATCH_OPTIMAL_KERNEL',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_batch_index_select_dim0_backward_codegen_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_dense_split_weighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../embedding__backward__split__template_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: embedding_backward_split_template.cu'],['../embedding__backward__split__template_8cu.html#accce092d5cf27275da7d960efa6c6321',1,'DISPATCH_OPTIMAL_KERNEL: embedding_backward_split_template.cu']]], + 
['dispatch_5foptimal_5fnobag_5fforward_5fkernel_11',['DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../embedding__forward__split__template_8cu.html#a805da9b1e5a1c6e28a4d4c99501d1b1a',1,'DISPATCH_OPTIMAL_NOBAG_FORWARD_KERNEL: embedding_forward_split_template.cu']]], + ['dispatch_5foutput_5ftypes_12',['DISPATCH_OUTPUT_TYPES',['../dispatch__macros_8h.html#a91c270ea1cbf887747abbaf8883b7175',1,'dispatch_macros.h']]], + ['dispatch_5fto_5fall_13',['DISPATCH_TO_ALL',['../sparse__ops__utils_8h.html#ae80e8b33bdef7d2849eb3d516ff67d1b',1,'sparse_ops_utils.h']]], + 
['dispatch_5fto_5fautograd_14',['DISPATCH_TO_AUTOGRAD',['../sparse__ops__utils_8h.html#aab6390a9590ead03a896aae2b93a96ed',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fautograd_5fcuda_15',['DISPATCH_TO_AUTOGRAD_CUDA',['../sparse__ops__utils_8h.html#adb242971e11b66b1f8f58c361e44b8e7',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fautograd_5fmeta_16',['DISPATCH_TO_AUTOGRAD_META',['../sparse__ops__utils_8h.html#a8ed65710de63bd56275d2ceded5d59b4',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fcpu_17',['DISPATCH_TO_CPU',['../sparse__ops__utils_8h.html#af5cf39897136f04c6f2ac5f3544c49c3',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fcuda_18',['DISPATCH_TO_CUDA',['../sparse__ops__utils_8h.html#a06de50f3ede518ff59612c9ada5a85c8',1,'sparse_ops_utils.h']]], + ['dispatch_5fto_5fmeta_19',['DISPATCH_TO_META',['../sparse__ops__utils_8h.html#aa751218a0e9119ad6fa4d6d4df63fda5',1,'sparse_ops_utils.h']]], + ['div_5fround_5fup_20',['DIV_ROUND_UP',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a0f2b77785cbc55639ba4e4874a65426c',1,'DIV_ROUND_UP: embedding_forward_split_kernel_v2_template.cu']]], + ['dll_5fpublic_21',['DLL_PUBLIC',['../ops__utils_8h.html#a29047de4dfe891435d8254535634ac1d',1,'ops_utils.h']]] +]; diff --git a/search/defines_4.js b/search/defines_4.js new file mode 100644 index 000000000..c2f8d0985 --- /dev/null +++ b/search/defines_4.js @@ -0,0 +1,16 @@ +var searchData= +[ + ['fbgemm_5fdispatch_5ffloat_5fand_5fhalf_0',['FBGEMM_DISPATCH_FLOAT_AND_HALF',['../dispatch__macros_8h.html#a6db9b7506116844ae45993577c3b9ac4',1,'dispatch_macros.h']]], + 
['fbgemm_5fdispatch_5ffloat_5fand_5fhalf_5fcase_1',['FBGEMM_DISPATCH_FLOAT_AND_HALF_CASE',['../dispatch__macros_8h.html#a60faa23c8d1bf9d75a2e598a5654ecff',1,'dispatch_macros.h']]], + ['fbgemm_5fdispatch_5ffloat_5fhalf_5fand_5fbfloat16_2',['FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16',['../dispatch__macros_8h.html#ab9329efe2d7882cbc2bd358b6672c292',1,'dispatch_macros.h']]], + ['fbgemm_5fdispatch_5ffloat_5fhalf_5fand_5fbfloat16_5fcase_3',['FBGEMM_DISPATCH_FLOAT_HALF_AND_BFLOAT16_CASE',['../dispatch__macros_8h.html#a7c7e35b09a14b3d5b76339803712ce7e',1,'dispatch_macros.h']]], + ['fbgemm_5fgpu_5fcub_5fns_5fprefix_4',['FBGEMM_GPU_CUB_NS_PREFIX',['../cub__namespace__postfix_8cuh.html#a12567f2486c4686871a5330dbd8e9bb4',1,'cub_namespace_postfix.cuh']]], + ['fbgemm_5fgpu_5fenum_5fcreate_5ftag_5',['FBGEMM_GPU_ENUM_CREATE_TAG',['../enum__utils_8h.html#a769a65d91133d4f233bcf10280ff7a3c',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fglogal_6',['FBGEMM_GPU_ENUM_GLOGAL',['../enum__utils_8h.html#adc8e24189b6f5a58092ade0b27e197b1',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fitem_7',['FBGEMM_GPU_ENUM_ITEM',['../enum__utils_8h.html#aef8d28be61e5e22bac45bf59c53dabbd',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fop_8',['FBGEMM_GPU_ENUM_OP',['../enum__utils_8h.html#abcc6d46ce5e5452b5b49f96ae0aa67f3',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fregister_5fend_9',['FBGEMM_GPU_ENUM_REGISTER_END',['../enum__utils_8h.html#a1fc46fffc78f3820ce4668b6b2a92b55',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5fregister_5fstart_10',['FBGEMM_GPU_ENUM_REGISTER_START',['../enum__utils_8h.html#a3c1089cc9b549d33d50c20c14b348950',1,'enum_utils.h']]], + ['fbgemm_5fgpu_5fenum_5ftag_11',['FBGEMM_GPU_ENUM_TAG',['../enum__utils_8h.html#aae161db28429e0e2aa9001448f52e2f4',1,'enum_utils.h']]], + ['fbgemm_5fop_5fdispatch_12',['FBGEMM_OP_DISPATCH',['../ops__utils_8h.html#aed63a3f5bb9ae1c01f230bee2d95ea05',1,'ops_utils.h']]] +]; diff --git a/search/defines_5.js b/search/defines_5.js new file mode 100644 index 
000000000..20f5d6416 --- /dev/null +++ b/search/defines_5.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['hex_0',['HEX',['../_c_make_c_compiler_id_8c.html#a46d5d95daa1bef867bd0179594310ed5',1,'HEX: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a46d5d95daa1bef867bd0179594310ed5',1,'HEX: CMakeCXXCompilerId.cpp']]] +]; diff --git a/search/defines_6.js b/search/defines_6.js new file mode 100644 index 000000000..a0e92b2a4 --- /dev/null +++ b/search/defines_6.js @@ -0,0 +1,9 @@ +var searchData= +[ + ['instantiate_5fbatched_5fcsr2csc_0',['INSTANTIATE_BATCHED_CSR2CSC',['../embedding__forward__split__cpu_8cpp.html#a32da455953694aac0b5e837bd3f1c31a',1,'embedding_forward_split_cpu.cpp']]], + ['invoke_5fgroup_5findex_5fselect_5for_5fadd_1',['INVOKE_GROUP_INDEX_SELECT_OR_ADD',['../sparse__group__index_8cu.html#acc7197a16e3ef386f0fd807a0919110b',1,'sparse_group_index.cu']]], + ['invoke_5fkernel_5fwith_5fdim_2',['INVOKE_KERNEL_WITH_DIM',['../jagged__tensor__ops_2common_8cuh.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: common.cuh'],['../jagged__tensor__ops_2common_8cuh.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: common.cuh'],['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_dense_dense_elementwise_add_jagged_output_forward.cu'],['../jagged__dense__elementwise__mul__backward_8cu.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_dense_elementwise_mul_backward.cu'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#ac4adf873a2fdf50491e9cc9647e3f6cc',1,'INVOKE_KERNEL_WITH_DIM: jagged_tensor_ops_cpu.cpp']]], + 
['invoke_5flinearize_5findex_5fkernel_3',['INVOKE_LINEARIZE_INDEX_KERNEL',['../transpose__embedding__input_8cu.html#ac03452638c5653f404a402f9f7356841',1,'transpose_embedding_input.cu']]], + ['invoke_5fprocess_5fall_5findices_4',['INVOKE_PROCESS_ALL_INDICES',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#acaeccb7e2e5908cef08556661b7a6f44',1,'INVOKE_PROCESS_ALL_INDICES: embedding_forward_split_kernel_v2_template.cu']]], + ['invoke_5fprocess_5fall_5findices_5fhelper_5',['INVOKE_PROCESS_ALL_INDICES_HELPER',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a65e818853d870f84ef24b703b0e02618',1,'INVOKE_PROCESS_ALL_INDICES_HELPER: embedding_forward_split_kernel_v2_template.cu']]] +]; diff --git a/search/defines_7.js b/search/defines_7.js new file mode 100644 index 000000000..ae1d1ff32 --- /dev/null +++ b/search/defines_7.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['jagged_5ftensor_5fdispatch_5fdims_0',['JAGGED_TENSOR_DISPATCH_DIMS',['../sparse__ops__utils_8h.html#a8f3cc6f3a1a83750715b4ddcb228ca8b',1,'sparse_ops_utils.h']]] +]; diff --git a/search/defines_8.js b/search/defines_8.js new file mode 100644 index 000000000..661f6dfe4 --- /dev/null +++ b/search/defines_8.js @@ -0,0 +1,7 @@ +var searchData= 
+[ + ['launch_5fauc_5fkernel_0',['LAUNCH_AUC_KERNEL',['../metric__ops_8cu.html#af8d70229cb61aff5f2f2e8f1abb10440',1,'metric_ops.cu']]], + ['launch_5findex_5fselect_1',['LAUNCH_INDEX_SELECT',['../sparse__index__select_8cu.html#a501f87ecefcbe28091d9a1c48499d3f6',1,'sparse_index_select.cu']]], + ['launch_5fkernel_2',['LAUNCH_KERNEL',['../keyed__jagged__index__select__dim1_8cu.html#a2ffb148e7bce97b5375e01ac265cc967',1,'keyed_jagged_index_select_dim1.cu']]], + ['ldg_3',['LDG',['../sparse__ops_2common_8cuh.html#a9e7ecd25c1168b19568b2ba40a731c39',1,'common.cuh']]] +]; diff --git a/search/defines_9.js b/search/defines_9.js new file mode 100644 index 000000000..4b04de124 --- /dev/null +++ b/search/defines_9.js @@ -0,0 +1,9 @@ +var searchData= +[ + ['make_5fpacked_5ftensor_5faccessor_5facc_5ftype_5fbase_0',['MAKE_PACKED_TENSOR_ACCESSOR_ACC_TYPE_BASE',['../fbgemm__tensor__accessor_8h.html#ae2a2547758e08761f973874a074b4fc1',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpacked_5ftensor_5faccessor_5fbase_1',['MAKE_PACKED_TENSOR_ACCESSOR_BASE',['../fbgemm__tensor__accessor_8h.html#ad5bf508fef6a8c9528a8f1c316bfd491',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpta_5facc_5fwith_5fname_2',['MAKE_PTA_ACC_WITH_NAME',['../fbgemm__tensor__accessor_8h.html#a23a5f2ae4f72b11bd67c678ae14d9af7',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpta_5fwith_5fname_3',['MAKE_PTA_WITH_NAME',['../fbgemm__tensor__accessor_8h.html#a614f4b016e2758186bd598bc3be6e6cf',1,'fbgemm_tensor_accessor.h']]], + ['max_4',['max',['../fbgemm__cuda__utils_8cuh.html#affe776513b24d84b39af8ab0930fef7f',1,'fbgemm_cuda_utils.cuh']]], + ['min_5',['min',['../fbgemm__cuda__utils_8cuh.html#ac6afabdc09a49a433ee19d8a9486056d',1,'fbgemm_cuda_utils.cuh']]] +]; diff --git a/search/defines_a.js b/search/defines_a.js new file mode 100644 index 000000000..67233fa7f --- /dev/null +++ b/search/defines_a.js @@ -0,0 +1,4 @@ +var searchData= +[ + 
['nvml_5fcheck_0',['NVML_CHECK',['../topology__utils_8cpp.html#af1ec00426a14a4658189ab308ea76636',1,'topology_utils.cpp']]] +]; diff --git a/search/defines_b.js b/search/defines_b.js new file mode 100644 index 000000000..4af4ced91 --- /dev/null +++ b/search/defines_b.js @@ -0,0 +1,10 @@ +var searchData= +[ + ['platform_5fid_0',['PLATFORM_ID',['../_c_make_c_compiler_id_8c.html#adbc5372f40838899018fadbc89bd588b',1,'PLATFORM_ID: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#adbc5372f40838899018fadbc89bd588b',1,'PLATFORM_ID: CMakeCXXCompilerId.cpp']]], + ['private_5fcase_5ftype_5fcache_1',['PRIVATE_CASE_TYPE_CACHE',['../dispatch__macros_8h.html#ab66dce26ee489c79f3a0441be14902fa',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5fcache_5femb_2',['PRIVATE_CASE_TYPE_CACHE_EMB',['../dispatch__macros_8h.html#a98d43954b688bc60b943227d761487b3',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5femb_3',['PRIVATE_CASE_TYPE_EMB',['../dispatch__macros_8h.html#af2c9e16b5345c0cdb6611357e0ec15db',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5foutput_4',['PRIVATE_CASE_TYPE_OUTPUT',['../dispatch__macros_8h.html#a3905d2ceab136e10c35a2ff4fe29a7d0',1,'dispatch_macros.h']]], + ['private_5fcase_5ftype_5foutput2_5',['PRIVATE_CASE_TYPE_OUTPUT2',['../dispatch__macros_8h.html#a17577aa7f884011133210418a790641a',1,'dispatch_macros.h']]], + ['pt2_5fcompliant_5ftag_6',['PT2_COMPLIANT_TAG',['../dispatch__macros_8h.html#a3b8ceecef1ba0067d90eea1764298cda',1,'dispatch_macros.h']]] +]; diff --git a/search/defines_c.js b/search/defines_c.js new file mode 100644 index 000000000..f6d2f1cfc --- /dev/null +++ b/search/defines_c.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['quantize_5fops_5fmax_0',['QUANTIZE_OPS_MAX',['../quantize__ops_2common_8cuh.html#ac84aa8e4e97b2a4675ec853e802ec4c6',1,'common.cuh']]], + ['quantize_5fops_5fmin_1',['QUANTIZE_OPS_MIN',['../quantize__ops_2common_8cuh.html#a7c9f79708fed845d68b88205e5a1c70c',1,'common.cuh']]] +]; diff --git a/search/defines_d.js 
b/search/defines_d.js new file mode 100644 index 000000000..be30b1401 --- /dev/null +++ b/search/defines_d.js @@ -0,0 +1,13 @@ +var searchData= +[ + ['shfl_5fsync_0',['SHFL_SYNC',['../embedding__forward__template__helpers_8cuh.html#adce6eee5db9c1c3f52ff15d9fe263495',1,'SHFL_SYNC: embedding_forward_template_helpers.cuh'],['../embedding__backward__template__helpers_8cuh.html#adce6eee5db9c1c3f52ff15d9fe263495',1,'SHFL_SYNC: embedding_backward_template_helpers.cuh']]], + ['smem_5fcache_5fweight_5fdata_1',['SMEM_CACHE_WEIGHT_DATA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a9e947cce4a2cf3d4f94feeaf6024a3e3',1,'SMEM_CACHE_WEIGHT_DATA: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fcache_5fweight_5fptr_2',['SMEM_CACHE_WEIGHT_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a8f5221b4fcc0397e5c260e567afd000f',1,'SMEM_CACHE_WEIGHT_PTR: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5femb_5fweight_5fdata_3',['SMEM_EMB_WEIGHT_DATA',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a741fa81333f21f397dd7bcb524345f77',1,'SMEM_EMB_WEIGHT_DATA: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5femb_5fweight_5fptr_4',['SMEM_EMB_WEIGHT_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a240239b93a27d2333aba0661096e3f2f',1,'SMEM_EMB_WEIGHT_PTR: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fgeneric_5fptr_5',['SMEM_GENERIC_PTR',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a82d4ccecee745b4cadb5d2d04e986efc',1,'SMEM_GENERIC_PTR: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5foffset_6',['SMEM_OFFSET',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a5c92b1dfe0de84f52323da3897cb0bb4',1,'SMEM_OFFSET: embedding_forward_split_kernel_v2_template.cu']]], + ['smem_5fptr_5fbase_7',['SMEM_PTR_BASE',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aec0d9a647d3bde05780ff426af9ebf45',1,'SMEM_PTR_BASE: embedding_forward_split_kernel_v2_template.cu']]], + ['stringify_8',['STRINGIFY',['../_c_make_c_compiler_id_8c.html#a43e1cad902b6477bec893cb6430bd6c8',1,'STRINGIFY: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a43e1cad902b6477bec893cb6430bd6c8',1,'STRINGIFY: CMakeCXXCompilerId.cpp']]], + ['stringify_5fhelper_9',['STRINGIFY_HELPER',['../_c_make_c_compiler_id_8c.html#a2ae9b72bb13abaabfcf2ee0ba7d3fa1d',1,'STRINGIFY_HELPER: CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a2ae9b72bb13abaabfcf2ee0ba7d3fa1d',1,'STRINGIFY_HELPER: CMakeCXXCompilerId.cpp']]] +]; diff --git a/search/defines_e.js b/search/defines_e.js new file mode 100644 index 000000000..59db48377 --- /dev/null +++ b/search/defines_e.js @@ -0,0 +1,19 @@ +var searchData= +[ + ['tensor_5fcontiguous_0',['TENSOR_CONTIGUOUS',['../sparse__ops__utils_8h.html#a333341c9590667c47753510e0da7b6e3',1,'sparse_ops_utils.h']]], + ['tensor_5fcontiguous_5fand_5fon_5fcpu_1',['TENSOR_CONTIGUOUS_AND_ON_CPU',['../sparse__ops__utils_8h.html#a0378cd5f9e716f13079b83a9b9805691',1,'sparse_ops_utils.h']]], + ['tensor_5fcontiguous_5fand_5fon_5fcuda_5fgpu_2',['TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#a350ade6aa989687c2ca8ced000e200ff',1,'sparse_ops_utils.h']]], + 
['tensor_5fempty_5for_5fon_5fcpu_3',['TENSOR_EMPTY_OR_ON_CPU',['../sparse__ops__utils_8h.html#a73ab1987fec37ac982ae1ed77be0e3ea',1,'sparse_ops_utils.h']]], + ['tensor_5fempty_5for_5fon_5fcuda_5fgpu_4',['TENSOR_EMPTY_OR_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#aff83e4ada08cf70146ffc4ac2009aa9a',1,'sparse_ops_utils.h']]], + ['tensor_5fndim_5fequals_5',['TENSOR_NDIM_EQUALS',['../sparse__ops__utils_8h.html#a485f848acf189619cb61a0ae7534eaa1',1,'sparse_ops_utils.h']]], + ['tensor_5fndim_5fexceeds_6',['TENSOR_NDIM_EXCEEDS',['../sparse__ops__utils_8h.html#acfab048550cb0518bdb1ac267ef1e7ba',1,'sparse_ops_utils.h']]], + ['tensor_5fndim_5fis_5fge_7',['TENSOR_NDIM_IS_GE',['../sparse__ops__utils_8h.html#abd9e69a82885e6e361275a0b08ebe565',1,'sparse_ops_utils.h']]], + ['tensor_5fon_5fcpu_8',['TENSOR_ON_CPU',['../sparse__ops__utils_8h.html#a5d19d4051835acd2c6d83eb637341010',1,'sparse_ops_utils.h']]], + ['tensor_5fon_5fcuda_5fgpu_9',['TENSOR_ON_CUDA_GPU',['../sparse__ops__utils_8h.html#ac6089c2908cb1ae6367af5cf7bbea30d',1,'sparse_ops_utils.h']]], + ['tensor_5ftype_5fmust_5fbe_10',['TENSOR_TYPE_MUST_BE',['../sparse__ops__utils_8h.html#a003b5640cfa59fe8f5da9b1c9fcb8f26',1,'sparse_ops_utils.h']]], + ['tensors_5fempty_5for_5fon_5fsame_5fdevice_11',['TENSORS_EMPTY_OR_ON_SAME_DEVICE',['../sparse__ops__utils_8h.html#a3df91ae56fe10d1c002bed63e5b78d1b',1,'sparse_ops_utils.h']]], + ['tensors_5fhave_5fsame_5fnumel_12',['TENSORS_HAVE_SAME_NUMEL',['../sparse__ops__utils_8h.html#a9be1e573e7d3e35f3db03210e2624e61',1,'sparse_ops_utils.h']]], + ['tensors_5fhave_5fsame_5ftype_13',['TENSORS_HAVE_SAME_TYPE',['../sparse__ops__utils_8h.html#a97687675a3398d3168fe8f07a1b4db87',1,'sparse_ops_utils.h']]], + ['tensors_5fon_5fsame_5fcuda_5fgpu_5fif_5fnot_5foptional_14',['TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL',['../sparse__ops__utils_8h.html#a4724e1d67266b6998b8fe4ef1ec743d9',1,'sparse_ops_utils.h']]], + 
['tensors_5fon_5fsame_5fdevice_15',['TENSORS_ON_SAME_DEVICE',['../sparse__ops__utils_8h.html#aa6ef8e13e3280066cc5f4f0970d3e7a6',1,'sparse_ops_utils.h']]] +]; diff --git a/search/defines_f.js b/search/defines_f.js new file mode 100644 index 000000000..172a46f56 --- /dev/null +++ b/search/defines_f.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['weight_5foffset_0',['WEIGHT_OFFSET',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a1e90593b9eb03be49ddd5e3e5473f0b5',1,'WEIGHT_OFFSET: embedding_forward_split_kernel_v2_template.cu']]] +]; diff --git a/search/enums_0.js b/search/enums_0.js new file mode 100644 index 000000000..fe4c5e166 --- /dev/null +++ b/search/enums_0.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['args_5fpos_0',['args_pos',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396',1,'fbgemm_gpu']]] +]; diff --git a/search/enums_1.js b/search/enums_1.js new file mode 100644 index 000000000..846798e5e --- /dev/null +++ b/search/enums_1.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['boundscheckmode_0',['BoundsCheckMode',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111ea',1,'fbgemm_gpu']]] +]; diff --git a/search/enums_2.js b/search/enums_2.js new file mode 100644 index 000000000..71800ea52 --- /dev/null +++ b/search/enums_2.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['lxu_5fcache_5fparams_0',['LXU_CACHE_PARAMS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071d',1,'LXU_CACHE_PARAMS: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071d',1,'LXU_CACHE_PARAMS: gen_embedding_forward_split_weighted_v2_kernel.cu']]] +]; diff --git a/search/enums_3.js b/search/enums_3.js new file mode 100644 index 000000000..133e43282 --- /dev/null +++ b/search/enums_3.js @@ -0,0 +1,6 @@ +var searchData= +[ + ['placementtype_0',['PlacementType',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194',1,'fbgemm_gpu']]], + ['poolingmode_1',['PoolingMode',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5d',1,'fbgemm_gpu']]], + ['primitivetype_2',['PrimitiveType',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60',1,'fbgemm_gpu']]] +]; diff --git a/search/enums_4.js b/search/enums_4.js new file mode 100644 index 000000000..810cb42e4 --- /dev/null +++ b/search/enums_4.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['saved_5fparams_0',['SAVED_PARAMS',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54',1,'SAVED_PARAMS: embedding_forward_split_kernel_v2_template.cu']]], + ['sparsetype_1',['SparseType',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833e',1,'fbgemm_gpu']]] +]; diff --git a/search/enums_5.js b/search/enums_5.js new file mode 100644 index 000000000..d8ad7e7e1 --- /dev/null +++ b/search/enums_5.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['uvm_5fcache_5fstats_5findex_0',['uvm_cache_stats_index',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aa',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_0.js b/search/enumvalues_0.js 
new file mode 100644 index 000000000..6bf7323dc --- /dev/null +++ b/search/enumvalues_0.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['bf_0',['BF',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a7b8d2f92148f52cad46e331936922e80',1,'fbgemm_gpu']]], + ['bf16_1',['BF16',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaf656bbf613964dcf710b771b0918ab30',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_1.js b/search/enumvalues_1.js new file mode 100644 index 000000000..307736a7c --- /dev/null +++ b/search/enumvalues_1.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['device_0',['DEVICE',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194ae10b6ab6a278644ce40631f62f360b6d',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_2.js b/search/enumvalues_2.js new file mode 100644 index 000000000..c4c78f28b --- /dev/null +++ b/search/enumvalues_2.js @@ -0,0 +1,8 @@ +var searchData= +[ + ['fatal_0',['FATAL',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaa19da7170bea36556dde582519795f3fc',1,'fbgemm_gpu']]], + ['fp_1',['FP',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a4ebada6a2af2bcba53ded1d7b414f081',1,'fbgemm_gpu']]], + ['fp16_2',['FP16',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaa4bf99d6945c25077fd6660d536af8a0',1,'fbgemm_gpu']]], + ['fp32_3',['FP32',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea693aa0bef84c25fe81c7e62e72f9313d',1,'fbgemm_gpu']]], + ['fp8_4',['FP8',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eae32efd813b88548940f8718a61864cf5',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_3.js b/search/enumvalues_3.js new file mode 100644 index 000000000..b907f0d6c --- /dev/null +++ b/search/enumvalues_3.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['host_0',['HOST',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194ab9361011891280a44d85b967739cc6a5',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_4.js 
b/search/enumvalues_4.js new file mode 100644 index 000000000..ce6bc7a08 --- /dev/null +++ b/search/enumvalues_4.js @@ -0,0 +1,9 @@ +var searchData= +[ + ['ignore_0',['IGNORE',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaaa2e843feab94ef623fea888f07c28696',1,'fbgemm_gpu']]], + ['int_1',['INT',['../namespacefbgemm__gpu.html#aa7e45742197542f659233c21b883ba60a53f93baa3057821107c750323892fa92',1,'fbgemm_gpu']]], + ['int2_2',['INT2',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea8fbf1fab49398b0d298699ea3ccbebc5',1,'fbgemm_gpu']]], + ['int4_3',['INT4',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833ea94635600f8a63640263a5ebc30d79a2a',1,'fbgemm_gpu']]], + ['int8_4',['INT8',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaee9d73311ff0658494edfff14c3ec1e3',1,'fbgemm_gpu']]], + ['invalid_5',['INVALID',['../namespacefbgemm__gpu.html#a47b4476e5f749d63e15d2f8e55be833eaccc0377a8afbf50e7094f5c23a8af223',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_5.js b/search/enumvalues_5.js new file mode 100644 index 000000000..53c5d1efb --- /dev/null +++ b/search/enumvalues_5.js @@ -0,0 +1,6 @@ +var searchData= +[ + ['managed_0',['MANAGED',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194af59a25f2594f469f0bfccad7f8f13744',1,'fbgemm_gpu']]], + ['managed_5fcaching_1',['MANAGED_CACHING',['../namespacefbgemm__gpu.html#a8f04cbe33fa88d1e420c06b1f8879194a3664f93edf39a3e7e0a84f3cefb624a6',1,'fbgemm_gpu']]], + ['mean_2',['MEAN',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5da4ea6d1161ea24d7599365f574aff6610',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_6.js b/search/enumvalues_6.js new file mode 100644 index 000000000..f7d8517d9 --- /dev/null +++ b/search/enumvalues_6.js @@ -0,0 +1,10 @@ +var searchData= +[ + ['none_0',['NONE',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5dab50339a10e1de285ac99d4c3990b8693',1,'fbgemm_gpu']]], + 
['num_5fcalls_1',['num_calls',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaadaf139c74384603431fd1bbb3347aa34',1,'fbgemm_gpu']]], + ['num_5fconflict_5fmisses_2',['num_conflict_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaac0cd9dffdb3c001656bee52db850d1c6',1,'fbgemm_gpu']]], + ['num_5fconflict_5funique_5fmisses_3',['num_conflict_unique_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaa30ee3b3c17bbfefe571f4ea5e99b00d6',1,'fbgemm_gpu']]], + ['num_5frequested_5findices_4',['num_requested_indices',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaacf3fcf7ace9b3a5b4ab424c874b84439',1,'fbgemm_gpu']]], + ['num_5funique_5findices_5',['num_unique_indices',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaaa555e0f1fe32e24cc25b049fdf3d0afc',1,'fbgemm_gpu']]], + ['num_5funique_5fmisses_6',['num_unique_misses',['../namespacefbgemm__gpu.html#aefeeb0d13ba9b557b8d693c43e5a43aaaabea3db589a421890b799e0ac63dfc53',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_7.js b/search/enumvalues_7.js new file mode 100644 index 000000000..ddc871e83 --- /dev/null +++ b/search/enumvalues_7.js @@ -0,0 +1,20 @@ +var searchData= +[ + ['p_5findex_5fweights_0',['P_index_weights',['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54ae6fddad64ad96f09ab2bf8e417dcab18',1,'gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5findices_1',['P_indices',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a4b8443d24ef0d6d8b29d1de191b5fa20',1,'P_indices: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5findices_5fis_5flong_2',['P_indices_is_long',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ac640586328f5125ff8881c6b93fac125',1,'fbgemm_gpu']]], + ['p_5findices_5foffsets_3',['P_indices_offsets',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a5f3a87c5dbebfaefd128c19ebbe6c7de',1,'fbgemm_gpu']]], + ['p_5findices_5fprts_4',['P_indices_prts',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a8ae3847f58b98ba0ff4b0fcdfb4ae8e6',1,'fbgemm_gpu']]], + ['p_5flengths_5faddrs_5',['P_lengths_addrs',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a66aa4e0ec73344232b5d56ee78ef17b0',1,'fbgemm_gpu']]], + ['p_5flengths_5fis_5flong_6',['P_lengths_is_long',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396a1c841401de519f97ca671d064c22250e',1,'fbgemm_gpu']]], + ['p_5flengths_5foffsets_7',['P_lengths_offsets',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ad300b64361a3f3e756bfa78fd0b23b97',1,'fbgemm_gpu']]], + ['p_5fload_5fd_8',['P_load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a8cfa61b080ef7d26fbe3b8d150b04834',1,'P_load_D: embedding_forward_split_kernel_v2_template.cu']]], + 
['p_5flxu_5fcache_5flocations_9',['P_lxu_cache_locations',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071da9e6d36a61249ee13ac61fee16a76d83c',1,'P_lxu_cache_locations: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071da9e6d36a61249ee13ac61fee16a76d83c',1,'P_lxu_cache_locations: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5flxu_5fcache_5fweights_10',['P_lxu_cache_weights',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071daf09c8e1f82af5f3e97070537dec964e0',1,'P_lxu_cache_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad863bc0421e111195e2ac11c7ad2071daf09c8e1f82af5f3e97070537dec964e0',1,'P_lxu_cache_weights: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5fnum_5foffsets_11',['P_num_offsets',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aecf1052c404b0ca815cb290cb8854144',1,'P_num_offsets: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5foffsets_12',['P_offsets',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a7fcce188570ec66dece71f0da186e029',1,'P_offsets: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a7fcce188570ec66dece71f0da186e029',1,'P_offsets: gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['p_5foutputs_13',['P_outputs',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a00d2586446417c7ba88c313f0901f3da',1,'P_outputs: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5fper_5fsample_5fweight_14',['P_per_sample_weight',['../namespacefbgemm__gpu.html#afcbf1cd70ce8ea074c2e799d1559b396ae38edd0733e3ec3ca85cfa8bd9b8ac93',1,'fbgemm_gpu']]], + ['p_5ftotal_5fload_5fd_15',['P_total_load_D',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54a26b29347efcf14fcee3eef781e755ea2',1,'P_total_load_D: embedding_forward_split_kernel_v2_template.cu']]], + ['p_5fweights_16',['P_weights',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#aa8c3f67d852b0552ccbe30a899cd4c54aeea99260eda72bf0110c9b54f0ebcb52',1,'P_weights: embedding_forward_split_kernel_v2_template.cu']]] +]; diff --git a/search/enumvalues_8.js b/search/enumvalues_8.js new file mode 100644 index 000000000..2ecf9cd7f --- /dev/null +++ b/search/enumvalues_8.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['sum_0',['SUM',['../namespacefbgemm__gpu.html#aa1f721fe0d5e5a710e7a05f788f01f5da6970bdc2201030b9c03fbdcf3973858a',1,'fbgemm_gpu']]] +]; diff --git a/search/enumvalues_9.js b/search/enumvalues_9.js new file mode 100644 index 000000000..312eea4b7 --- /dev/null +++ b/search/enumvalues_9.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['value_0',['value',['../structlog2__calc__.html#a06fc87d81c62e9abb8790b6e5713c55ba97de9ab6885342a574053b8f64a563a9',1,'log2_calc_::value'],['../structlog2__calc___3_010_01_4.html#adf764cbdea00d65edcd07bb9953ad2b7a97de9ab6885342a574053b8f64a563a9',1,'log2_calc_< 0 >::value'],['../structlog2__calc.html#a99fb83031ce9923c84392b4e92f956b5a97de9ab6885342a574053b8f64a563a9',1,'log2_calc::value']]] +]; diff --git a/search/enumvalues_a.js b/search/enumvalues_a.js new file mode 100644 index 000000000..3d76361c2 --- /dev/null +++ b/search/enumvalues_a.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['warning_0',['WARNING',['../namespacefbgemm__gpu.html#a70433200cf584e2429434a33d45111eaa059e9861e0400dfbe05c98a841f3f96b',1,'fbgemm_gpu']]] +]; diff --git a/search/files_0.js b/search/files_0.js new file mode 100644 index 000000000..a0ba37c73 --- /dev/null +++ b/search/files_0.js @@ -0,0 +1,8 @@ +var searchData= +[ + 
['batch_5findex_5fselect_5fdim0_5fcpu_5fhost_2ecpp_0',['batch_index_select_dim0_cpu_host.cpp',['../batch__index__select__dim0__cpu__host_8cpp.html',1,'']]], + ['batch_5findex_5fselect_5fdim0_5fhost_2ecpp_1',['batch_index_select_dim0_host.cpp',['../batch__index__select__dim0__host_8cpp.html',1,'']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_2ecu_2',['batched_dense_vec_jagged_2d_mul_backward.cu',['../batched__dense__vec__jagged__2d__mul__backward_8cu.html',1,'']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_2ecu_3',['batched_dense_vec_jagged_2d_mul_forward.cu',['../batched__dense__vec__jagged__2d__mul__forward_8cu.html',1,'']]], + ['bench_5futils_2ecuh_4',['bench_utils.cuh',['../bench__utils_8cuh.html',1,'']]] +]; diff --git a/search/files_1.js b/search/files_1.js new file mode 100644 index 000000000..268f60745 --- /dev/null +++ b/search/files_1.js @@ -0,0 +1,13 @@ +var searchData= +[ + ['cmakeccompilerid_2ec_0',['CMakeCCompilerId.c',['../_c_make_c_compiler_id_8c.html',1,'']]], + ['cmakecxxcompilerid_2ecpp_1',['CMakeCXXCompilerId.cpp',['../_c_make_c_x_x_compiler_id_8cpp.html',1,'']]], + ['common_2ecuh_2',['common.cuh',['../jagged__tensor__ops_2common_8cuh.html',1,'(Global Namespace)'],['../memory__utils_2common_8cuh.html',1,'(Global Namespace)'],['../quantize__ops_2common_8cuh.html',1,'(Global Namespace)'],['../sparse__ops_2common_8cuh.html',1,'(Global Namespace)'],['../split__embeddings__cache_2common_8cuh.html',1,'(Global Namespace)']]], + ['common_2eh_3',['common.h',['../memory__utils_2common_8h.html',1,'(Global Namespace)'],['../split__embeddings__cache_2common_8h.html',1,'(Global Namespace)']]], + ['cpu_5fkernel_5ftest_2ecpp_4',['cpu_kernel_test.cpp',['../cpu__kernel__test_8cpp.html',1,'']]], + ['cpu_5futils_2eh_5',['cpu_utils.h',['../cpu__utils_8h.html',1,'']]], + ['cub_5fnamespace_5fpostfix_2ecuh_6',['cub_namespace_postfix.cuh',['../cub__namespace__postfix_8cuh.html',1,'']]], + 
['cub_5fnamespace_5fprefix_2ecuh_7',['cub_namespace_prefix.cuh',['../cub__namespace__prefix_8cuh.html',1,'']]], + ['cuda_5futils_2ecuh_8',['cuda_utils.cuh',['../cuda__utils_8cuh.html',1,'']]], + ['cumem_5futils_2eh_9',['cumem_utils.h',['../cumem__utils_8h.html',1,'']]] +]; diff --git a/search/files_10.js b/search/files_10.js new file mode 100644 index 000000000..86ea8cc52 --- /dev/null +++ b/search/files_10.js @@ -0,0 +1,40 @@ +var searchData= +[ + ['sparse_5fasync_5fcumsum_2ecu_0',['sparse_async_cumsum.cu',['../sparse__async__cumsum_8cu.html',1,'']]], + ['sparse_5fbatched_5funary_5fembeddings_2ecu_1',['sparse_batched_unary_embeddings.cu',['../sparse__batched__unary__embeddings_8cu.html',1,'']]], + ['sparse_5fblock_5fbucketize_5ffeatures_2ecu_2',['sparse_block_bucketize_features.cu',['../sparse__block__bucketize__features_8cu.html',1,'']]], + ['sparse_5fbucketize_5ffeatures_2ecu_3',['sparse_bucketize_features.cu',['../sparse__bucketize__features_8cu.html',1,'']]], + ['sparse_5fcompute_5ffrequency_5fsequence_2ecu_4',['sparse_compute_frequency_sequence.cu',['../sparse__compute__frequency__sequence_8cu.html',1,'']]], + ['sparse_5fexpand_5finto_5fjagged_5fpermute_2ecu_5',['sparse_expand_into_jagged_permute.cu',['../sparse__expand__into__jagged__permute_8cu.html',1,'']]], + ['sparse_5fgroup_5findex_2ecu_6',['sparse_group_index.cu',['../sparse__group__index_8cu.html',1,'']]], + ['sparse_5findex_5fadd_2ecu_7',['sparse_index_add.cu',['../sparse__index__add_8cu.html',1,'']]], + ['sparse_5findex_5fselect_2ecu_8',['sparse_index_select.cu',['../sparse__index__select_8cu.html',1,'']]], + ['sparse_5finvert_5fpermute_2ecu_9',['sparse_invert_permute.cu',['../sparse__invert__permute_8cu.html',1,'']]], + ['sparse_5fops_2ecuh_10',['sparse_ops.cuh',['../sparse__ops_8cuh.html',1,'']]], + ['sparse_5fops_2eh_11',['sparse_ops.h',['../sparse__ops_8h.html',1,'']]], + ['sparse_5fops_5fcpu_2ecpp_12',['sparse_ops_cpu.cpp',['../sparse__ops__cpu_8cpp.html',1,'']]], + 
['sparse_5fops_5fgpu_2ecpp_13',['sparse_ops_gpu.cpp',['../sparse__ops__gpu_8cpp.html',1,'']]], + ['sparse_5fops_5fmeta_2ecpp_14',['sparse_ops_meta.cpp',['../sparse__ops__meta_8cpp.html',1,'']]], + ['sparse_5fops_5futils_2eh_15',['sparse_ops_utils.h',['../sparse__ops__utils_8h.html',1,'']]], + ['sparse_5fops_5futils_5ftest_2ecpp_16',['sparse_ops_utils_test.cpp',['../sparse__ops__utils__test_8cpp.html',1,'']]], + ['sparse_5fpack_5fsegments_5fbackward_2ecu_17',['sparse_pack_segments_backward.cu',['../sparse__pack__segments__backward_8cu.html',1,'']]], + ['sparse_5fpack_5fsegments_5fforward_2ecu_18',['sparse_pack_segments_forward.cu',['../sparse__pack__segments__forward_8cu.html',1,'']]], + ['sparse_5fpermute102_2ecu_19',['sparse_permute102.cu',['../sparse__permute102_8cu.html',1,'']]], + ['sparse_5fpermute_5f1d_2ecu_20',['sparse_permute_1d.cu',['../sparse__permute__1d_8cu.html',1,'']]], + ['sparse_5fpermute_5f2d_2ecu_21',['sparse_permute_2d.cu',['../sparse__permute__2d_8cu.html',1,'']]], + ['sparse_5fpermute_5fembeddings_2ecu_22',['sparse_permute_embeddings.cu',['../sparse__permute__embeddings_8cu.html',1,'']]], + ['sparse_5frange_2ecu_23',['sparse_range.cu',['../sparse__range_8cu.html',1,'']]], + ['sparse_5freorder_5fbatched_5fad_2ecu_24',['sparse_reorder_batched_ad.cu',['../sparse__reorder__batched__ad_8cu.html',1,'']]], + ['sparse_5fsegment_5fsum_5fcsr_2ecu_25',['sparse_segment_sum_csr.cu',['../sparse__segment__sum__csr_8cu.html',1,'']]], + ['sparse_5fzipf_2ecu_26',['sparse_zipf.cu',['../sparse__zipf_8cu.html',1,'']]], + ['split_5fembeddings_5fcache_5fcuda_2ecuh_27',['split_embeddings_cache_cuda.cuh',['../split__embeddings__cache__cuda_8cuh.html',1,'']]], + ['split_5fembeddings_5fcache_5fops_2ecpp_28',['split_embeddings_cache_ops.cpp',['../split__embeddings__cache__ops_8cpp.html',1,'']]], + ['split_5fembeddings_5fcache_5fops_2ecu_29',['split_embeddings_cache_ops.cu',['../split__embeddings__cache__ops_8cu.html',1,'']]], + 
['split_5fembeddings_5futils_2ecpp_30',['split_embeddings_utils.cpp',['../split__embeddings__utils_8cpp.html',1,'']]], + ['split_5fembeddings_5futils_2ecuh_31',['split_embeddings_utils.cuh',['../split__embeddings__utils_8cuh.html',1,'']]], + ['ssd_5fsplit_5fembeddings_5fcache_5fcuda_2ecu_32',['ssd_split_embeddings_cache_cuda.cu',['../ssd__split__embeddings__cache__cuda_8cu.html',1,'']]], + ['ssd_5fsplit_5ftable_5fbatched_5fembeddings_2ecpp_33',['ssd_split_table_batched_embeddings.cpp',['../ssd__split__table__batched__embeddings_8cpp.html',1,'']]], + ['ssd_5ftable_5fbatched_5fembeddings_2eh_34',['ssd_table_batched_embeddings.h',['../ssd__table__batched__embeddings_8h.html',1,'']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_2ecu_35',['stacked_jagged_1d_to_dense.cu',['../stacked__jagged__1d__to__dense_8cu.html',1,'']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_2ecu_36',['stacked_jagged_2d_to_dense.cu',['../stacked__jagged__2d__to__dense_8cu.html',1,'']]] +]; diff --git a/search/files_11.js b/search/files_11.js new file mode 100644 index 000000000..0adefda48 --- /dev/null +++ b/search/files_11.js @@ -0,0 +1,7 @@ +var searchData= +[ + ['tensor_5fassert_5ftest_2ecpp_0',['tensor_assert_test.cpp',['../tensor__assert__test_8cpp.html',1,'']]], + ['topology_5futils_2ecpp_1',['topology_utils.cpp',['../topology__utils_8cpp.html',1,'']]], + ['topology_5futils_2eh_2',['topology_utils.h',['../topology__utils_8h.html',1,'']]], + ['transpose_5fembedding_5finput_2ecu_3',['transpose_embedding_input.cu',['../transpose__embedding__input_8cu.html',1,'']]] +]; diff --git a/search/files_12.js b/search/files_12.js new file mode 100644 index 000000000..d4e4f3005 --- /dev/null +++ b/search/files_12.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['uvm_5fcache_5fmiss_5femulate_5ftest_2ecpp_0',['uvm_cache_miss_emulate_test.cpp',['../uvm__cache__miss__emulate__test_8cpp.html',1,'']]] +]; diff --git a/search/files_13.js b/search/files_13.js new file mode 100644 index 000000000..3f5f262a7 --- /dev/null +++ 
b/search/files_13.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['verify_5ffp16_5fstochastic_5fbenchmark_2ecu_0',['verify_fp16_stochastic_benchmark.cu',['../verify__fp16__stochastic__benchmark_8cu.html',1,'']]] +]; diff --git a/search/files_2.js b/search/files_2.js new file mode 100644 index 000000000..473d42e4b --- /dev/null +++ b/search/files_2.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['dense_5fto_5fjagged_5fforward_2ecu_0',['dense_to_jagged_forward.cu',['../dense__to__jagged__forward_8cu.html',1,'']]], + ['dispatch_5fmacros_2eh_1',['dispatch_macros.h',['../dispatch__macros_8h.html',1,'']]] +]; diff --git a/search/files_3.js b/search/files_3.js new file mode 100644 index 000000000..1b5e55fb8 --- /dev/null +++ b/search/files_3.js @@ -0,0 +1,45 @@ +var searchData= +[ + ['embedding_5fbackward_5fdense_5fhost_2ecpp_0',['embedding_backward_dense_host.cpp',['../embedding__backward__dense__host_8cpp.html',1,'']]], + ['embedding_5fbackward_5fdense_5fhost_5fcpu_2ecpp_1',['embedding_backward_dense_host_cpu.cpp',['../embedding__backward__dense__host__cpu_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fcpu_5fapprox_5ftemplate_2ecpp_2',['embedding_backward_split_cpu_approx_template.cpp',['../embedding__backward__split__cpu__approx__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fcpu_5ftemplate_2ecpp_3',['embedding_backward_split_cpu_template.cpp',['../embedding__backward__split__cpu__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fgrad_5ftemplate_2ecu_4',['embedding_backward_split_grad_template.cu',['../embedding__backward__split__grad__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fhost_5fcpu_5ftemplate_2ecpp_5',['embedding_backward_split_host_cpu_template.cpp',['../embedding__backward__split__host__cpu__template_8cpp.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fhost_5ftemplate_2ecpp_6',['embedding_backward_split_host_template.cpp',['../embedding__backward__split__host__template_8cpp.html',1,'']]], + 
['embedding_5fbackward_5fsplit_5findice_5fweights_5ftemplate_2ecu_7',['embedding_backward_split_indice_weights_template.cu',['../embedding__backward__split__indice__weights__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fkernel_5fcta_5ftemplate_2ecu_8',['embedding_backward_split_kernel_cta_template.cu',['../embedding__backward__split__kernel__cta__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5fkernel_5fwarp_5ftemplate_2ecu_9',['embedding_backward_split_kernel_warp_template.cu',['../embedding__backward__split__kernel__warp__template_8cu.html',1,'']]], + ['embedding_5fbackward_5fsplit_5ftemplate_2ecu_10',['embedding_backward_split_template.cu',['../embedding__backward__split__template_8cu.html',1,'']]], + ['embedding_5fbackward_5ftemplate_5fhelpers_2ecuh_11',['embedding_backward_template_helpers.cuh',['../embedding__backward__template__helpers_8cuh.html',1,'']]], + ['embedding_5fbounds_5fcheck_2ecu_12',['embedding_bounds_check.cu',['../embedding__bounds__check_8cu.html',1,'']]], + ['embedding_5fbounds_5fcheck_5fhost_2ecpp_13',['embedding_bounds_check_host.cpp',['../embedding__bounds__check__host_8cpp.html',1,'']]], + ['embedding_5fbounds_5fcheck_5fhost_5fcpu_2ecpp_14',['embedding_bounds_check_host_cpu.cpp',['../embedding__bounds__check__host__cpu_8cpp.html',1,'']]], + ['embedding_5fcommon_2eh_15',['embedding_common.h',['../embedding__common_8h.html',1,'']]], + ['embedding_5fforward_5fquantized_5fcpu_5ftemplate_2ecpp_16',['embedding_forward_quantized_cpu_template.cpp',['../embedding__forward__quantized__cpu__template_8cpp.html',1,'']]], + ['embedding_5fforward_5fquantized_5fhost_2ecpp_17',['embedding_forward_quantized_host.cpp',['../embedding__forward__quantized__host_8cpp.html',1,'']]], + ['embedding_5fforward_5fquantized_5fhost_5fcpu_2ecpp_18',['embedding_forward_quantized_host_cpu.cpp',['../embedding__forward__quantized__host__cpu_8cpp.html',1,'']]], + 
['embedding_5fforward_5fquantized_5fsplit_5flookup_2ecu_19',['embedding_forward_quantized_split_lookup.cu',['../embedding__forward__quantized__split__lookup_8cu.html',1,'']]], + ['embedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5ftemplate_2ecu_20',['embedding_forward_quantized_split_nbit_host_template.cu',['../embedding__forward__quantized__split__nbit__host__template_8cu.html',1,'']]], + ['embedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5ftemplate_2ecu_21',['embedding_forward_quantized_split_nbit_kernel_template.cu',['../embedding__forward__quantized__split__nbit__kernel__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fcpu_2ecpp_22',['embedding_forward_split_cpu.cpp',['../embedding__forward__split__cpu_8cpp.html',1,'']]], + ['embedding_5fforward_5fsplit_5fcpu_2eh_23',['embedding_forward_split_cpu.h',['../embedding__forward__split__cpu_8h.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5fnobag_5fsmall_5ftemplate_2ecu_24',['embedding_forward_split_kernel_nobag_small_template.cu',['../embedding__forward__split__kernel__nobag__small__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5ftemplate_2ecu_25',['embedding_forward_split_kernel_template.cu',['../embedding__forward__split__kernel__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fkernel_5fv2_5ftemplate_2ecu_26',['embedding_forward_split_kernel_v2_template.cu',['../embedding__forward__split__kernel__v2__template_8cu.html',1,'']]], + ['embedding_5fforward_5fsplit_5fmeta_5ftemplate_2ecpp_27',['embedding_forward_split_meta_template.cpp',['../embedding__forward__split__meta__template_8cpp.html',1,'']]], + ['embedding_5fforward_5fsplit_5ftemplate_2ecu_28',['embedding_forward_split_template.cu',['../embedding__forward__split__template_8cu.html',1,'']]], + ['embedding_5fforward_5ftemplate_5fhelpers_2ecuh_29',['embedding_forward_template_helpers.cuh',['../embedding__forward__template__helpers_8cuh.html',1,'']]], + 
['embedding_5finplace_5fupdate_2ecu_30',['embedding_inplace_update.cu',['../embedding__inplace__update_8cu.html',1,'']]], + ['embedding_5finplace_5fupdate_2eh_31',['embedding_inplace_update.h',['../embedding__inplace__update_8h.html',1,'']]], + ['embedding_5finplace_5fupdate_5fcpu_2ecpp_32',['embedding_inplace_update_cpu.cpp',['../embedding__inplace__update__cpu_8cpp.html',1,'']]], + ['embedding_5finplace_5fupdate_5fgpu_2ecpp_33',['embedding_inplace_update_gpu.cpp',['../embedding__inplace__update__gpu_8cpp.html',1,'']]], + ['embedding_5finplace_5fupdate_5ftest_2ecpp_34',['embedding_inplace_update_test.cpp',['../embedding__inplace__update__test_8cpp.html',1,'']]], + ['embedding_5fop_5fregistration_2eh_35',['embedding_op_registration.h',['../embedding__op__registration_8h.html',1,'']]], + ['embedding_5fops_5fplaceholder_2ecpp_36',['embedding_ops_placeholder.cpp',['../embedding__ops__placeholder_8cpp.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5fdevice_5fkernel_5ftemplate_2ecuh_37',['embedding_optimizer_split_device_kernel_template.cuh',['../embedding__optimizer__split__device__kernel__template_8cuh.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5fhost_5ftemplate_2ecpp_38',['embedding_optimizer_split_host_template.cpp',['../embedding__optimizer__split__host__template_8cpp.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5fkernel_5ftemplate_2ecu_39',['embedding_optimizer_split_kernel_template.cu',['../embedding__optimizer__split__kernel__template_8cu.html',1,'']]], + ['embedding_5foptimizer_5fsplit_5ftemplate_2ecu_40',['embedding_optimizer_split_template.cu',['../embedding__optimizer__split__template_8cu.html',1,'']]], + ['enum_5futils_2eh_41',['enum_utils.h',['../enum__utils_8h.html',1,'']]] +]; diff --git a/search/files_4.js b/search/files_4.js new file mode 100644 index 000000000..2515308e0 --- /dev/null +++ b/search/files_4.js @@ -0,0 +1,5 @@ +var searchData= +[ + 
['fbgemm_5fcuda_5futils_2ecuh_0',['fbgemm_cuda_utils.cuh',['../fbgemm__cuda__utils_8cuh.html',1,'']]], + ['fbgemm_5ftensor_5faccessor_2eh_1',['fbgemm_tensor_accessor.h',['../fbgemm__tensor__accessor_8h.html',1,'']]] +]; diff --git a/search/files_5.js b/search/files_5.js new file mode 100644 index 000000000..085227b01 --- /dev/null +++ b/search/files_5.js @@ -0,0 +1,257 @@ +var searchData= +[ + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fcodegen_5fcuda_2ecu_0',['gen_batch_index_select_dim0_backward_codegen_cuda.cu',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fkernel_5fcta_2ecu_1',['gen_batch_index_select_dim0_backward_kernel_cta.cu',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fbackward_5fkernel_5fwarp_2ecu_2',['gen_batch_index_select_dim0_backward_kernel_warp.cu',['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fcodegen_5fcuda_2ecu_3',['gen_batch_index_select_dim0_forward_codegen_cuda.cu',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fkernel_2ecu_4',['gen_batch_index_select_dim0_forward_kernel.cu',['../gen__batch__index__select__dim0__forward__kernel_8cu.html',1,'']]], + ['gen_5fbatch_5findex_5fselect_5fdim0_5fforward_5fkernel_5fsmall_2ecu_5',['gen_batch_index_select_dim0_forward_kernel_small.cu',['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fcpu_2ecpp_6',['gen_embedding_backward_adagrad_split_cpu.cpp',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_7',['gen_embedding_backward_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_8',['gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_9',['gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_10',['gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_11',['gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_12',['gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_13',['gen_embedding_backward_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_14',['gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_15',['gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fcuda_2ecu_16',['gen_embedding_backward_adam_split_unweighted_cuda.cu',['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fkernel_5fcta_2ecu_17',['gen_embedding_backward_adam_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_18',['gen_embedding_backward_adam_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_19',['gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_20',['gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_21',['gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fcuda_2ecu_22',['gen_embedding_backward_adam_split_weighted_cuda.cu',['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fkernel_5fcta_2ecu_23',['gen_embedding_backward_adam_split_weighted_kernel_cta.cu',['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fadam_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_24',['gen_embedding_backward_adam_split_weighted_kernel_warp.cu',['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fcuda_2ecu_25',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fcta_2ecu_26',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_27',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_28',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_29',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_30',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fcuda_2ecu_31',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fcta_2ecu_32',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_33',['gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fdense_5findice_5fweights_5fcodegen_5fcuda_2ecu_34',['gen_embedding_backward_dense_indice_weights_codegen_cuda.cu',['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fcpu_2ecpp_35',['gen_embedding_backward_dense_split_cpu.cpp',['../gen__embedding__backward__dense__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fcuda_2ecu_36',['gen_embedding_backward_dense_split_unweighted_cuda.cu',['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fkernel_5fcta_2ecu_37',['gen_embedding_backward_dense_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_38',['gen_embedding_backward_dense_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_39',['gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_40',['gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_41',['gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fcuda_2ecu_42',['gen_embedding_backward_dense_split_weighted_cuda.cu',['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fkernel_5fcta_2ecu_43',['gen_embedding_backward_dense_split_weighted_kernel_cta.cu',['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fdense_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_44',['gen_embedding_backward_dense_split_weighted_kernel_warp.cu',['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fcuda_2ecu_45',['gen_embedding_backward_lamb_split_unweighted_cuda.cu',['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fkernel_5fcta_2ecu_46',['gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_47',['gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_48',['gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_49',['gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_50',['gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fcuda_2ecu_51',['gen_embedding_backward_lamb_split_weighted_cuda.cu',['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fkernel_5fcta_2ecu_52',['gen_embedding_backward_lamb_split_weighted_kernel_cta.cu',['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flamb_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_53',['gen_embedding_backward_lamb_split_weighted_kernel_warp.cu',['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fcuda_2ecu_54',['gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fkernel_5fcta_2ecu_55',['gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_56',['gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_57',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_58',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_59',['gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fcuda_2ecu_60',['gen_embedding_backward_lars_sgd_split_weighted_cuda.cu',['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fkernel_5fcta_2ecu_61',['gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu',['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5flars_5fsgd_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_62',['gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu',['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fcuda_2ecu_63',['gen_embedding_backward_none_split_unweighted_cuda.cu',['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fkernel_5fcta_2ecu_64',['gen_embedding_backward_none_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_65',['gen_embedding_backward_none_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_66',['gen_embedding_backward_none_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_67',['gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_68',['gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fcuda_2ecu_69',['gen_embedding_backward_none_split_weighted_cuda.cu',['../gen__embedding__backward__none__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fkernel_5fcta_2ecu_70',['gen_embedding_backward_none_split_weighted_kernel_cta.cu',['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fnone_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_71',['gen_embedding_backward_none_split_weighted_kernel_warp.cu',['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fcuda_2ecu_72',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fkernel_5fcta_2ecu_73',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_74',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_75',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_76',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_77',['gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fcuda_2ecu_78',['gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fkernel_5fcta_2ecu_79',['gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fpartial_5frowwise_5fadam_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_80',['gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fcuda_2ecu_81',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fkernel_5fcta_2ecu_82',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_83',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_84',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_85',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_86',['gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fcuda_2ecu_87',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fkernel_5fcta_2ecu_88',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fpartial_5frowwise_5flamb_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_89',['gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fcpu_2ecpp_90',['gen_embedding_backward_rowwise_adagrad_split_cpu.cpp',['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_91',['gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_92',['gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_93',['gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_94',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_95',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_96',['gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fcuda_2ecu_97',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fkernel_5fcta_2ecu_98',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5funweighted_5fvbe_5fkernel_5fwarp_2ecu_99',['gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_100',['gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_101',['gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_102',['gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fcuda_2ecu_103',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fkernel_5fcta_2ecu_104',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fsplit_5fweighted_5fvbe_5fkernel_5fwarp_2ecu_105',['gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fcpu_2ecpp_106',['gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fcuda_2ecu_107',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fkernel_5fcta_2ecu_108',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_109',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_110',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_111',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_112',['gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fcuda_2ecu_113',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fkernel_5fcta_2ecu_114',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_115',['gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fcuda_2ecu_116',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fcta_2ecu_117',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_118',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_119',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_120',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_121',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fcuda_2ecu_122',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fcta_2ecu_123',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_124',['gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html',1,'']]], 
+ ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fcpu_2ecpp_125',['gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp',['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fcuda_2ecu_126',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fkernel_5fcta_2ecu_127',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_128',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_129',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_130',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_131',['gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fcuda_2ecu_132',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fkernel_5fcta_2ecu_133',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5frowwise_5fweighted_5fadagrad_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_134',['gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fcpu_2ecpp_135',['gen_embedding_backward_sgd_split_cpu.cpp',['../gen__embedding__backward__sgd__split__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fcuda_2ecu_136',['gen_embedding_backward_sgd_split_unweighted_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fkernel_5fcta_2ecu_137',['gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fkernel_5fwarp_2ecu_138',['gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fcuda_2ecu_139',['gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fcta_2ecu_140',['gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fnobag_5fkernel_5fwarp_2ecu_141',['gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fcuda_2ecu_142',['gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fkernel_5fcta_2ecu_143',['gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5funweighted_5fvbe_5fkernel_5fwarp_2ecu_144',['gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu',['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fcuda_2ecu_145',['gen_embedding_backward_sgd_split_weighted_cuda.cu',['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fkernel_5fcta_2ecu_146',['gen_embedding_backward_sgd_split_weighted_kernel_cta.cu',['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fkernel_5fwarp_2ecu_147',['gen_embedding_backward_sgd_split_weighted_kernel_warp.cu',['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fcuda_2ecu_148',['gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fkernel_5fcta_2ecu_149',['gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsgd_5fsplit_5fweighted_5fvbe_5fkernel_5fwarp_2ecu_150',['gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu',['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadagrad_2ecpp_151',['gen_embedding_backward_split_adagrad.cpp',['../gen__embedding__backward__split__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadagrad_5fcpu_2ecpp_152',['gen_embedding_backward_split_adagrad_cpu.cpp',['../gen__embedding__backward__split__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadam_2ecpp_153',['gen_embedding_backward_split_adam.cpp',['../gen__embedding__backward__split__adam_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fadam_5fcpu_2ecpp_154',['gen_embedding_backward_split_adam_cpu.cpp',['../gen__embedding__backward__split__adam__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_2ecpp_155',['gen_embedding_backward_split_approx_rowwise_adagrad.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fcpu_2ecpp_156',['gen_embedding_backward_split_approx_rowwise_adagrad_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_2ecpp_157',['gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_2ecpp_158',['gen_embedding_backward_split_approx_rowwise_adagrad_with_counter_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_2ecpp_159',['gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fcpu_2ecpp_160',['gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay_cpu.cpp',['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5fsgd_2ecpp_161',['gen_embedding_backward_split_approx_sgd.cpp',['../gen__embedding__backward__split__approx__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fapprox_5fsgd_5fcpu_2ecpp_162',['gen_embedding_backward_split_approx_sgd_cpu.cpp',['../gen__embedding__backward__split__approx__sgd__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fgrad_2ecu_163',['gen_embedding_backward_split_grad.cu',['../gen__embedding__backward__split__grad_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5findice_5fweights_5fcodegen_5fcuda_2ecu_164',['gen_embedding_backward_split_indice_weights_codegen_cuda.cu',['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flamb_2ecpp_165',['gen_embedding_backward_split_lamb.cpp',['../gen__embedding__backward__split__lamb_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flamb_5fcpu_2ecpp_166',['gen_embedding_backward_split_lamb_cpu.cpp',['../gen__embedding__backward__split__lamb__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flars_5fsgd_2ecpp_167',['gen_embedding_backward_split_lars_sgd.cpp',['../gen__embedding__backward__split__lars__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5flars_5fsgd_5fcpu_2ecpp_168',['gen_embedding_backward_split_lars_sgd_cpu.cpp',['../gen__embedding__backward__split__lars__sgd__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fnone_2ecpp_169',['gen_embedding_backward_split_none.cpp',['../gen__embedding__backward__split__none_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fnone_5fcpu_2ecpp_170',['gen_embedding_backward_split_none_cpu.cpp',['../gen__embedding__backward__split__none__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5fadam_2ecpp_171',['gen_embedding_backward_split_partial_rowwise_adam.cpp',['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5fadam_5fcpu_2ecpp_172',['gen_embedding_backward_split_partial_rowwise_adam_cpu.cpp',['../gen__embedding__backward__split__partial__rowwise__adam__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5flamb_2ecpp_173',['gen_embedding_backward_split_partial_rowwise_lamb.cpp',['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fpartial_5frowwise_5flamb_5fcpu_2ecpp_174',['gen_embedding_backward_split_partial_rowwise_lamb_cpu.cpp',['../gen__embedding__backward__split__partial__rowwise__lamb__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_2ecpp_175',['gen_embedding_backward_split_rowwise_adagrad.cpp',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fcpu_2ecpp_176',['gen_embedding_backward_split_rowwise_adagrad_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fcounter_2ecpp_177',['gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_2ecpp_178',['gen_embedding_backward_split_rowwise_adagrad_with_counter_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_2ecpp_179',['gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fcpu_2ecpp_180',['gen_embedding_backward_split_rowwise_adagrad_with_weight_decay_cpu.cpp',['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fweighted_5fadagrad_2ecpp_181',['gen_embedding_backward_split_rowwise_weighted_adagrad.cpp',['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5frowwise_5fweighted_5fadagrad_5fcpu_2ecpp_182',['gen_embedding_backward_split_rowwise_weighted_adagrad_cpu.cpp',['../gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fsgd_2ecpp_183',['gen_embedding_backward_split_sgd.cpp',['../gen__embedding__backward__split__sgd_8cpp.html',1,'']]], + ['gen_5fembedding_5fbackward_5fsplit_5fsgd_5fcpu_2ecpp_184',['gen_embedding_backward_split_sgd_cpu.cpp',['../gen__embedding__backward__split__sgd__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fcodegen_5fcuda_2ecu_185',['gen_embedding_forward_dense_unweighted_codegen_cuda.cu',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fcodegen_5fmeta_2ecpp_186',['gen_embedding_forward_dense_unweighted_codegen_meta.cpp',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fkernel_2ecu_187',['gen_embedding_forward_dense_unweighted_kernel.cu',['../gen__embedding__forward__dense__unweighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fnobag_5fkernel_2ecu_188',['gen_embedding_forward_dense_unweighted_nobag_kernel.cu',['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5funweighted_5fnobag_5fkernel_5fsmall_2ecu_189',['gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu',['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fdense_5fweighted_5fcodegen_5fcuda_2ecu_190',['gen_embedding_forward_dense_weighted_codegen_cuda.cu',['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5fweighted_5fcodegen_5fmeta_2ecpp_191',['gen_embedding_forward_dense_weighted_codegen_meta.cpp',['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fdense_5fweighted_5fkernel_2ecu_192',['gen_embedding_forward_dense_weighted_kernel.cu',['../gen__embedding__forward__dense__weighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5funweighted_5fcodegen_5fcuda_2ecu_193',['gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5funweighted_5fnobag_5fcodegen_5fcuda_2ecu_194',['gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fhost_5fweighted_5fcodegen_5fcuda_2ecu_195',['gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp16_5fcodegen_5fcuda_2ecu_196',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp16__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp32_5fcodegen_5fcuda_2ecu_197',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5ffp8_5fcodegen_5fcuda_2ecu_198',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint2_5fcodegen_5fcuda_2ecu_199',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint4_5fcodegen_5fcuda_2ecu_200',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fint8_5fcodegen_5fcuda_2ecu_201',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__int8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp16_5fcodegen_5fcuda_2ecu_202',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp16__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp32_5fcodegen_5fcuda_2ecu_203',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5ffp8_5fcodegen_5fcuda_2ecu_204',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint2_5fcodegen_5fcuda_2ecu_205',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint4_5fcodegen_5fcuda_2ecu_206',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5funweighted_5fnobag_5fint8_5fcodegen_5fcuda_2ecu_207',['gen_embedding_forward_quantized_split_nbit_kernel_unweighted_nobag_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__unweighted__nobag__int8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp16_5fcodegen_5fcuda_2ecu_208',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp16_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp16__codegen__cuda_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp32_5fcodegen_5fcuda_2ecu_209',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp32_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp32__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5ffp8_5fcodegen_5fcuda_2ecu_210',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_fp8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__fp8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint2_5fcodegen_5fcuda_2ecu_211',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int2_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int2__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint4_5fcodegen_5fcuda_2ecu_212',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int4_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int4__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fsplit_5fnbit_5fkernel_5fweighted_5fint8_5fcodegen_5fcuda_2ecu_213',['gen_embedding_forward_quantized_split_nbit_kernel_weighted_int8_codegen_cuda.cu',['../gen__embedding__forward__quantized__split__nbit__kernel__weighted__int8__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5funweighted_5fcodegen_5fcpu_2ecpp_214',['gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fquantized_5fweighted_5fcodegen_5fcpu_2ecpp_215',['gen_embedding_forward_quantized_weighted_codegen_cpu.cpp',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html',1,'']]], + 
['gen_5fembedding_5fforward_5fsplit_5funweighted_5fcodegen_5fcuda_2ecu_216',['gen_embedding_forward_split_unweighted_codegen_cuda.cu',['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fcodegen_5fmeta_2ecpp_217',['gen_embedding_forward_split_unweighted_codegen_meta.cpp',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fkernel_2ecu_218',['gen_embedding_forward_split_unweighted_kernel.cu',['../gen__embedding__forward__split__unweighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fnobag_5fkernel_2ecu_219',['gen_embedding_forward_split_unweighted_nobag_kernel.cu',['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fnobag_5fkernel_5fsmall_2ecu_220',['gen_embedding_forward_split_unweighted_nobag_kernel_small.cu',['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fv2_5fkernel_2ecu_221',['gen_embedding_forward_split_unweighted_v2_kernel.cu',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fcodegen_5fcuda_2ecu_222',['gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu',['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fcodegen_5fmeta_2ecpp_223',['gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp',['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5funweighted_5fvbe_5fkernel_2ecu_224',['gen_embedding_forward_split_unweighted_vbe_kernel.cu',['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html',1,'']]], + 
['gen_5fembedding_5fforward_5fsplit_5fweighted_5fcodegen_5fcuda_2ecu_225',['gen_embedding_forward_split_weighted_codegen_cuda.cu',['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fcodegen_5fmeta_2ecpp_226',['gen_embedding_forward_split_weighted_codegen_meta.cpp',['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fkernel_2ecu_227',['gen_embedding_forward_split_weighted_kernel.cu',['../gen__embedding__forward__split__weighted__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fv2_5fkernel_2ecu_228',['gen_embedding_forward_split_weighted_v2_kernel.cu',['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fcodegen_5fcuda_2ecu_229',['gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu',['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fcodegen_5fmeta_2ecpp_230',['gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp',['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html',1,'']]], + ['gen_5fembedding_5fforward_5fsplit_5fweighted_5fvbe_5fkernel_2ecu_231',['gen_embedding_forward_split_weighted_vbe_kernel.cu',['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_232',['gen_embedding_optimizer_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fadam_5fsplit_5fdevice_5fkernel_2ecuh_233',['gen_embedding_optimizer_adam_split_device_kernel.cuh',['../gen__embedding__optimizer__adam__split__device__kernel_8cuh.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_234',['gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fdevice_5fkernel_2ecuh_235',['gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fdevice_5fkernel_2ecuh_236',['gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fapprox_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_237',['gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fdense_5fsplit_5fdevice_5fkernel_2ecuh_238',['gen_embedding_optimizer_dense_split_device_kernel.cuh',['../gen__embedding__optimizer__dense__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5flamb_5fsplit_5fdevice_5fkernel_2ecuh_239',['gen_embedding_optimizer_lamb_split_device_kernel.cuh',['../gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5flars_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_240',['gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5fnone_5fsplit_5fdevice_5fkernel_2ecuh_241',['gen_embedding_optimizer_none_split_device_kernel.cuh',['../gen__embedding__optimizer__none__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fpartial_5frowwise_5fadam_5fsplit_5fdevice_5fkernel_2ecuh_242',['gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh',['../gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fpartial_5frowwise_5flamb_5fsplit_5fdevice_5fkernel_2ecuh_243',['gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh',['../gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_2ecpp_244',['gen_embedding_optimizer_rowwise_adagrad_split.cpp',['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fcuda_2ecu_245',['gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu',['../gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_246',['gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fsplit_5fkernel_2ecu_247',['gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu',['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fwith_5fcounter_5fsplit_5fdevice_5fkernel_2ecuh_248',['gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html',1,'']]], + 
['gen_5fembedding_5foptimizer_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fsplit_5fdevice_5fkernel_2ecuh_249',['gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5frowwise_5fweighted_5fadagrad_5fsplit_5fdevice_5fkernel_2ecuh_250',['gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh',['../gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html',1,'']]], + ['gen_5fembedding_5foptimizer_5fsgd_5fsplit_5fdevice_5fkernel_2ecuh_251',['gen_embedding_optimizer_sgd_split_device_kernel.cuh',['../gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html',1,'']]], + ['generate_5fvbe_5fmetadata_2ecu_252',['generate_vbe_metadata.cu',['../generate__vbe__metadata_8cu.html',1,'']]], + ['get_5finfos_5fmetadata_2ecu_253',['get_infos_metadata.cu',['../get__infos__metadata_8cu.html',1,'']]] +]; diff --git a/search/files_6.js b/search/files_6.js new file mode 100644 index 000000000..e2829ed98 --- /dev/null +++ b/search/files_6.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['histogram_5fbinning_5fcalibration_5fops_2ecu_0',['histogram_binning_calibration_ops.cu',['../histogram__binning__calibration__ops_8cu.html',1,'']]] +]; diff --git a/search/files_7.js b/search/files_7.js new file mode 100644 index 000000000..9c2d40673 --- /dev/null +++ b/search/files_7.js @@ -0,0 +1,7 @@ +var searchData= +[ + ['input_5fcombine_2ecu_0',['input_combine.cu',['../input__combine_8cu.html',1,'']]], + ['input_5fcombine_2eh_1',['input_combine.h',['../input__combine_8h.html',1,'']]], + ['input_5fcombine_5fcpu_2ecpp_2',['input_combine_cpu.cpp',['../input__combine__cpu_8cpp.html',1,'']]], + ['input_5fcombine_5fgpu_2ecpp_3',['input_combine_gpu.cpp',['../input__combine__gpu_8cpp.html',1,'']]] +]; diff --git a/search/files_8.js b/search/files_8.js new file mode 100644 index 
000000000..afb50db31 --- /dev/null +++ b/search/files_8.js @@ -0,0 +1,19 @@ +var searchData= +[ + ['jagged_5fdense_5fbmm_5fforward_2ecu_0',['jagged_dense_bmm_forward.cu',['../jagged__dense__bmm__forward_8cu.html',1,'']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_2ecu_1',['jagged_dense_dense_elementwise_add_jagged_output_forward.cu',['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html',1,'']]], + ['jagged_5fdense_5felementwise_5fmul_5fbackward_2ecu_2',['jagged_dense_elementwise_mul_backward.cu',['../jagged__dense__elementwise__mul__backward_8cu.html',1,'']]], + ['jagged_5fdense_5felementwise_5fmul_5fforward_2ecu_3',['jagged_dense_elementwise_mul_forward.cu',['../jagged__dense__elementwise__mul__forward_8cu.html',1,'']]], + ['jagged_5findex_5fadd_5f2d_5fforward_2ecu_4',['jagged_index_add_2d_forward.cu',['../jagged__index__add__2d__forward_8cu.html',1,'']]], + ['jagged_5findex_5fselect_5f2d_5fforward_2ecu_5',['jagged_index_select_2d_forward.cu',['../jagged__index__select__2d__forward_8cu.html',1,'']]], + ['jagged_5fjagged_5fbmm_5fforward_2ecu_6',['jagged_jagged_bmm_forward.cu',['../jagged__jagged__bmm__forward_8cu.html',1,'']]], + ['jagged_5fsoftmax_5fbackward_2ecu_7',['jagged_softmax_backward.cu',['../jagged__softmax__backward_8cu.html',1,'']]], + ['jagged_5fsoftmax_5fforward_2ecu_8',['jagged_softmax_forward.cu',['../jagged__softmax__forward_8cu.html',1,'']]], + ['jagged_5ftensor_5fops_2ecu_9',['jagged_tensor_ops.cu',['../jagged__tensor__ops_8cu.html',1,'']]], + ['jagged_5ftensor_5fops_5fautograd_2ecpp_10',['jagged_tensor_ops_autograd.cpp',['../jagged__tensor__ops__autograd_8cpp.html',1,'']]], + ['jagged_5ftensor_5fops_5fcpu_2ecpp_11',['jagged_tensor_ops_cpu.cpp',['../jagged__tensor__ops__cpu_8cpp.html',1,'']]], + ['jagged_5ftensor_5fops_5fmeta_2ecpp_12',['jagged_tensor_ops_meta.cpp',['../jagged__tensor__ops__meta_8cpp.html',1,'']]], + 
['jagged_5fto_5fpadded_5fdense_5fbackward_2ecu_13',['jagged_to_padded_dense_backward.cu',['../jagged__to__padded__dense__backward_8cu.html',1,'']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_2ecu_14',['jagged_to_padded_dense_forward.cu',['../jagged__to__padded__dense__forward_8cu.html',1,'']]], + ['jagged_5funique_5findices_2ecu_15',['jagged_unique_indices.cu',['../jagged__unique__indices_8cu.html',1,'']]] +]; diff --git a/search/files_9.js b/search/files_9.js new file mode 100644 index 000000000..751fbf1c4 --- /dev/null +++ b/search/files_9.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['keyed_5fjagged_5findex_5fselect_5fdim1_2ecu_0',['keyed_jagged_index_select_dim1.cu',['../keyed__jagged__index__select__dim1_8cu.html',1,'']]] +]; diff --git a/search/files_a.js b/search/files_a.js new file mode 100644 index 000000000..37958d9ba --- /dev/null +++ b/search/files_a.js @@ -0,0 +1,19 @@ +var searchData= +[ + ['layout_5ftransform_5fops_2ecu_0',['layout_transform_ops.cu',['../layout__transform__ops_8cu.html',1,'']]], + ['layout_5ftransform_5fops_2ecuh_1',['layout_transform_ops.cuh',['../layout__transform__ops_8cuh.html',1,'']]], + ['layout_5ftransform_5fops_5fcpu_2ecpp_2',['layout_transform_ops_cpu.cpp',['../layout__transform__ops__cpu_8cpp.html',1,'']]], + ['layout_5ftransform_5fops_5fgpu_2ecpp_3',['layout_transform_ops_gpu.cpp',['../layout__transform__ops__gpu_8cpp.html',1,'']]], + ['lfu_5fcache_5ffind_2ecu_4',['lfu_cache_find.cu',['../lfu__cache__find_8cu.html',1,'']]], + ['lfu_5fcache_5fpopulate_2ecu_5',['lfu_cache_populate.cu',['../lfu__cache__populate_8cu.html',1,'']]], + ['lfu_5fcache_5fpopulate_5fbyte_2ecpp_6',['lfu_cache_populate_byte.cpp',['../lfu__cache__populate__byte_8cpp.html',1,'']]], + ['lfu_5fcache_5fpopulate_5fbyte_2ecu_7',['lfu_cache_populate_byte.cu',['../lfu__cache__populate__byte_8cu.html',1,'']]], + ['linearize_5fcache_5findices_2ecpp_8',['linearize_cache_indices.cpp',['../linearize__cache__indices_8cpp.html',1,'']]], + 
['linearize_5fcache_5findices_2ecu_9',['linearize_cache_indices.cu',['../linearize__cache__indices_8cu.html',1,'']]], + ['lru_5fcache_5ffind_2ecu_10',['lru_cache_find.cu',['../lru__cache__find_8cu.html',1,'']]], + ['lru_5fcache_5fpopulate_2ecu_11',['lru_cache_populate.cu',['../lru__cache__populate_8cu.html',1,'']]], + ['lru_5fcache_5fpopulate_5fbyte_2ecpp_12',['lru_cache_populate_byte.cpp',['../lru__cache__populate__byte_8cpp.html',1,'']]], + ['lru_5fcache_5fpopulate_5fbyte_2ecu_13',['lru_cache_populate_byte.cu',['../lru__cache__populate__byte_8cu.html',1,'']]], + ['lxu_5fcache_2ecpp_14',['lxu_cache.cpp',['../lxu__cache_8cpp.html',1,'']]], + ['lxu_5fcache_2ecu_15',['lxu_cache.cu',['../lxu__cache_8cu.html',1,'']]] +]; diff --git a/search/files_b.js b/search/files_b.js new file mode 100644 index 000000000..ebdaae84b --- /dev/null +++ b/search/files_b.js @@ -0,0 +1,14 @@ +var searchData= +[ + ['memory_5futils_2ecpp_0',['memory_utils.cpp',['../memory__utils_8cpp.html',1,'']]], + ['memory_5futils_2ecu_1',['memory_utils.cu',['../memory__utils_8cu.html',1,'']]], + ['memory_5futils_5fops_2ecpp_2',['memory_utils_ops.cpp',['../memory__utils__ops_8cpp.html',1,'']]], + ['memory_5futils_5fops_2ecu_3',['memory_utils_ops.cu',['../memory__utils__ops_8cu.html',1,'']]], + ['memory_5futils_5fops_5fcpu_2ecpp_4',['memory_utils_ops_cpu.cpp',['../memory__utils__ops__cpu_8cpp.html',1,'']]], + ['merge_5fpooled_5fembedding_5fops_5fcpu_2ecpp_5',['merge_pooled_embedding_ops_cpu.cpp',['../merge__pooled__embedding__ops__cpu_8cpp.html',1,'']]], + ['merge_5fpooled_5fembedding_5fops_5fgpu_2ecpp_6',['merge_pooled_embedding_ops_gpu.cpp',['../merge__pooled__embedding__ops__gpu_8cpp.html',1,'']]], + ['merge_5fpooled_5fembeddings_2eh_7',['merge_pooled_embeddings.h',['../merge__pooled__embeddings_8h.html',1,'']]], + ['metric_5fops_2ecu_8',['metric_ops.cu',['../metric__ops_8cu.html',1,'']]], + ['metric_5fops_2eh_9',['metric_ops.h',['../metric__ops_8h.html',1,'']]], + 
['metric_5fops_5fhost_2ecpp_10',['metric_ops_host.cpp',['../metric__ops__host_8cpp.html',1,'']]] +]; diff --git a/search/files_c.js b/search/files_c.js new file mode 100644 index 000000000..c7fcaf676 --- /dev/null +++ b/search/files_c.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['ops_5futils_2eh_0',['ops_utils.h',['../ops__utils_8h.html',1,'']]] +]; diff --git a/search/files_d.js b/search/files_d.js new file mode 100644 index 000000000..2f24e1f69 --- /dev/null +++ b/search/files_d.js @@ -0,0 +1,14 @@ +var searchData= +[ + ['permute_5fpooled_5fembedding_5ffunction_2ecpp_0',['permute_pooled_embedding_function.cpp',['../permute__pooled__embedding__function_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_2ecu_1',['permute_pooled_embedding_ops.cu',['../permute__pooled__embedding__ops_8cu.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_2eh_2',['permute_pooled_embedding_ops.h',['../permute__pooled__embedding__ops_8h.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fcpu_2ecpp_3',['permute_pooled_embedding_ops_cpu.cpp',['../permute__pooled__embedding__ops__cpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fgpu_2ecpp_4',['permute_pooled_embedding_ops_gpu.cpp',['../permute__pooled__embedding__ops__gpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_2ecu_5',['permute_pooled_embedding_ops_split.cu',['../permute__pooled__embedding__ops__split_8cu.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_2eh_6',['permute_pooled_embedding_ops_split.h',['../permute__pooled__embedding__ops__split_8h.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_5fcpu_2ecpp_7',['permute_pooled_embedding_ops_split_cpu.cpp',['../permute__pooled__embedding__ops__split__cpu_8cpp.html',1,'']]], + ['permute_5fpooled_5fembedding_5fops_5fsplit_5fgpu_2ecpp_8',['permute_pooled_embedding_ops_split_gpu.cpp',['../permute__pooled__embedding__ops__split__gpu_8cpp.html',1,'']]], + 
['permute_5fpooled_5fembs_5ffunction_2eh_9',['permute_pooled_embs_function.h',['../permute__pooled__embs__function_8h.html',1,'']]], + ['permute_5fpooled_5fembs_5ffunction_5fsplit_2eh_10',['permute_pooled_embs_function_split.h',['../permute__pooled__embs__function__split_8h.html',1,'']]] +]; diff --git a/search/files_e.js b/search/files_e.js new file mode 100644 index 000000000..6e36be359 --- /dev/null +++ b/search/files_e.js @@ -0,0 +1,15 @@ +var searchData= +[ + ['quantize_5fbfloat16_2ecu_0',['quantize_bfloat16.cu',['../quantize__bfloat16_8cu.html',1,'']]], + ['quantize_5ffp8_5frowwise_2ecu_1',['quantize_fp8_rowwise.cu',['../quantize__fp8__rowwise_8cu.html',1,'']]], + ['quantize_5ffused_5f8bit_5frowwise_2ecu_2',['quantize_fused_8bit_rowwise.cu',['../quantize__fused__8bit__rowwise_8cu.html',1,'']]], + ['quantize_5ffused_5fnbit_5frowwise_2ecu_3',['quantize_fused_nbit_rowwise.cu',['../quantize__fused__nbit__rowwise_8cu.html',1,'']]], + ['quantize_5fhfp8_2ecu_4',['quantize_hfp8.cu',['../quantize__hfp8_8cu.html',1,'']]], + ['quantize_5fmsfp_2ecu_5',['quantize_msfp.cu',['../quantize__msfp_8cu.html',1,'']]], + ['quantize_5fops_2ecuh_6',['quantize_ops.cuh',['../quantize__ops_8cuh.html',1,'']]], + ['quantize_5fops_5fcpu_2ecpp_7',['quantize_ops_cpu.cpp',['../quantize__ops__cpu_8cpp.html',1,'']]], + ['quantize_5fops_5fgpu_2ecpp_8',['quantize_ops_gpu.cpp',['../quantize__ops__gpu_8cpp.html',1,'']]], + ['quantize_5fops_5fmeta_2ecpp_9',['quantize_ops_meta.cpp',['../quantize__ops__meta_8cpp.html',1,'']]], + ['quantize_5fops_5futils_2eh_10',['quantize_ops_utils.h',['../quantize__ops__utils_8h.html',1,'']]], + ['quantize_5fpadded_5ffp8_5frowwise_2ecu_11',['quantize_padded_fp8_rowwise.cu',['../quantize__padded__fp8__rowwise_8cu.html',1,'']]] +]; diff --git a/search/files_f.js b/search/files_f.js new file mode 100644 index 000000000..2a02461e5 --- /dev/null +++ b/search/files_f.js @@ -0,0 +1,5 @@ +var searchData= +[ + 
['radix_5fsort_5fpairs_2ecu_0',['radix_sort_pairs.cu',['../radix__sort__pairs_8cu.html',1,'']]], + ['reset_5fweight_5fmomentum_2ecu_1',['reset_weight_momentum.cu',['../reset__weight__momentum_8cu.html',1,'']]] +]; diff --git a/search/functions_0.js b/search/functions_0.js index 41eb52c4a..c51e3cc5b 100644 --- a/search/functions_0.js +++ b/search/functions_0.js @@ -1,9 +1,67 @@ var searchData= [ - ['_5fbfloat16_5fto_5ffloat_5fgpu_0',['_bfloat16_to_float_gpu',['../group__quantize-ops-cuda.html#ga2076a59fd190690f67c1eddb79b6acc4',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fbfloat16_5fgpu_1',['_float_to_bfloat16_gpu',['../group__quantize-ops-cuda.html#ga2f1cc4b6dc6f708324855f94d558cfc1',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fhfp8_5fgpu_2',['_float_to_hfp8_gpu',['../group__quantize-ops-cuda.html#gab2837424e3774fe34ba255658554a75a',1,'fbgemm_gpu']]], - ['_5ffloat_5fto_5fmsfp_5fgpu_3',['_float_to_msfp_gpu',['../group__quantize-ops-cuda.html#ga427f81e1d8901e2fafc9611860fbd4d5',1,'fbgemm_gpu']]], - ['_5fhfp8_5fto_5ffloat_5fgpu_4',['_hfp8_to_float_gpu',['../group__quantize-ops-cuda.html#ga03a8f8825a16c6235b699886fa46e1f6',1,'fbgemm_gpu']]], - ['_5fmsfp_5fto_5ffloat_5fgpu_5',['_msfp_to_float_gpu',['../group__quantize-ops-cuda.html#gac0c20377454dbfafcc5ac245fe6427ce',1,'fbgemm_gpu']]] + ['_5f_5falign_5f_5f_0',['__align__',['../namespacefbgemm__gpu.html#a9a25aa8cfdd2801c4576fb7111ca1e34',1,'fbgemm_gpu::__align__(32) float8'],['../namespacefbgemm__gpu.html#ac5ef7f218ca22e4dd93d4161458006f6',1,'fbgemm_gpu::__align__(64) float_16'],['../namespacefbgemm__gpu.html#a5365b81a771afde2d770210e45b73bdb',1,'fbgemm_gpu::__align__(8) half4'],['../namespacefbgemm__gpu.html#ad5af23eb5e28d14f6089e7a18b0ed0d5',1,'fbgemm_gpu::__align__(16) half8']]], + ['_5f_5flaunch_5fbounds_5f_5f_1',['__launch_bounds__',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6e4504b4f1023565bf18ac29f304f165',1,'__launch_bounds__(kMaxThreads) void 
batch_index_select_dim0_codegen_backward_kernel_cta_per_row(const pta: gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a419781019c14d9d59041ca2a127d2c1a',1,'__launch_bounds__(kMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_cta_per_row< uint8_t: gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#af1eb0a147a3656c72bff10b68454c23b',1,'__launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row(const pta: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a422cac14ead186e7d1ffdea24dbb41a2',1,'__launch_bounds__(kBackwardMaxThreads) void batch_index_select_dim0_codegen_backward_kernel_warp_per_row< uint8_t: gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#afe75d514238f01862b4416d072a457ab',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_kernel(const pta: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a794e5a8311030e080f19bcaf98cbaa3e',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_kernel< uint8_t: gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#aab67c7ff63262ed7ee2955ab54fd6cdb',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_small_kernel(const pta: gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__batch__index__select__dim0__forward__kernel__small_8cu.html#a09ab46cf824219bc6c7ca9a47e3d90cd',1,'__launch_bounds__(kForwardMaxThreads) __global__ void batch_index_select_dim0_codegen_forward_small_kernel< uint8_t: 
gen_batch_index_select_dim0_forward_kernel_small.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#aec97e553558684266790dc906158a105',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a60482659dcb929a1f6a60dda564f4cdc',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a50cb7dfbe0185fcbd26cfd0156710acc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a0ed9968b042349d756a20bfc8c31c22d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a7f198a235aa56925b36d48d029f9a26a',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abb3af3ab6c99e8609b2199129b2a6c3d',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2ca5c0c3b7f03146b0739206987a8efb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad265ff9fd07f592055eb413d73ff59a3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a479b62e3a680d0eb604b0d99c497dc44',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a16936797cd22aeea32b40dcc55e1d73f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a665c5d75524a34cec6f5b5258b182d7a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a7729be76298454212379af9803e78cf9',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adagrad_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aeb6425d7cade524ae83445d8ffcad95a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#ada6a5fbef27c4a4a31a9b8794e15442e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a8a0814be275ca40dd482231bf8be61ef',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#abcaa8e0b99a97add31e16f0454bd57d3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a30fd75bf7de9f2dd4c1af90a76cc4cab',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a43a11629fc716aa3fc2efce282ade1bf',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a1e6a8699bf2c46477da50582e38ee237',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae3a9242f5ffd888400f08b8c1662cc61',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#aa7724fd36f338edda8cec8fbce0dcc3f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#aaa0f0d28eaca058bde829af48b4a9b93',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a141a421e122929281f3a968d7181075d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#aa53241ccd067fda3b4f745364d104ae7',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_adam_weighted_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#aef74039cc67d8a29f2964dd2ead5c884',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a8005f4419a0e99b1adc8ba836e2bacc4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#af4e9ad9da78c796024828e400596398e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abbbfbac2a0d5a12edfd4fa6e476f5089',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a727c25d68451d781ee3328a76b544770',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a942a80794035682b67bf75531af7ea76',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a02950b6e35152a847c545ef90af6c315',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a46c9fa7a8cf628e30c5bcbd6713846b2',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#adf6d412fe63bcfdcd84fc4e45f616217',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a360c18a2f091431cf7f15e6ac14e848a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a60ab111bc496bd3b843b3d73350f6695',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#abe773e17b7f19a70a10efe7bf1763c07',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#ad49c5c5e6c69ba836c2c3728d383cd5c',1,'__launch_bounds__(kForwardMaxThreads) void dense_embedding_codegen_grad_indice_weights_kernel(const pta: gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a71d10fab767a3f6a4c9845432b7c673b',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a134107427281e66b9bdc1f05e0ed2006',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a321e6c7a5bc2c920f083dadb4d023bae',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a705c39686bcf17986ce0182b31944a82',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a8f6c6ea91c21be19960e453b8f83698b',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#aa068d67521003fac6c5013d12698b228',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a4d92990636a3fcdbe762a413cc96c642',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a9629d38b5ab429da94bb1d5099042123',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_dense_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a044189dd94a5b69db982c5e78a8258f4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a44794beb7b535ee85a06027407e9578d',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#aec8fd1dccb91dec69eee635d8cc8cae3',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ae68abaaf02536c2e20decd2ca4daef60',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_dense_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aa9d0b42dc9a6b6c25005e5adc6a412e3',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#af2bc792b1cf28a27ebfc0866b059fa81',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a502bf7dfa5a02ec71b77763a65ec91c5',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a8026675b09ae447bd48ab0a854ea28bf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aac66a737c59bab7e9f767b1e38d5f1d3',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#afd63238f6b7c4a1e468568bda42bb3e0',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a4e2287d8d0e80b53a592337a64570d66',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a11d09ebd4c4b65fb35d265de845d73fc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#afa2ba02eba70da5c0a8fdcd8509e7e77',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a61e7f43722eeda4e4234e1af525ae46e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afcf8fbf4f5013c1082ce86fa5c3a5fd4',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#ab184e11501d6d031e538c60ef66a8342',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lamb_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a8ea3bce56ea941e3716f81220ab88fe5',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#aaa9702e86f8ed1788c7796017bdd404c',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a0c5ac630cac3e582871b2521984d3691',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af7be11e596974198a45beaacc4d9db0f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a1be3f22e4eb6db21e09d922580c54faf',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a84b86dee7ee70d3e3ba5ae6f466c6f0e',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a89a6fc31913b2347216065f4655b82ff',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a584ad4898a3e03f279eb3a39c419735e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_lars_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#aa2038bd1822625bd55a38eed4240c39a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#ae41fadf6abfe1e00dccedd18b90dab32',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a9ba65eca59bd0b29e87b4adb5a444d1b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a60776cad67cb695e9768c1ce170aed12',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_lars_sgd_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#adb6e98291bfdb46d09389b2b453e54b2',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aab6d47d46fccfb5d973f8ff2a44bff7a',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a9d6d3f6a070db2a520adb97ff89e7f1c',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#adda6d498fce399be1bb4ff6c884cd325',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a3c38980139cb0d10bc2d195479a69fb4',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a55340037f2150aa438d4cb6675412e7e',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ae53e64a9190921226cba0e54595de4af',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a6c8f5295879f30dac04285180744b05b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_none_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a49683c14f18b75fafd2a5ce3f90c7d61',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a78a9d364ed7043a1412228b17a0406a1',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#acd2e3179752c56bfdde47a8ad7a00220',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a4d483ee9ae74898f27f8070e41c4fced',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_none_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a440dc2054a1346ad291f617540be2e25',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#afaf745a30243c0c755429f1b1d465f2d',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#ab435787b7738dff4daa1eca5ed8725dd',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a809fb9fd9f5386090e58c2fdd7f05bbd',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a250b8485cb708a3fe1d789613014b238',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a4996180a982a92cb9151e2557777d77a',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae96e8ad601ac1adb859d3aec074bb439',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae28f985f3c5d59410f3fd6c2a99d9320',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a59f00431d3950b72f6e7d89baf3fde0b',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a2f233fc13ea7dbc092ed3c22b2bf1a7f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a965b9c456ca6a6dffb664f585401250d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a69d372f391200ef3cafedad093a5470f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_adam_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#ad23ff52f91efba0cbff48134c3a42bc4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aa9475900cea03cb0a61e0e16932e01a4',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#aeea6e4ebbd44a284f8e1078cf3efdaad',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a75a810317afae4c2a93af95f80855d42',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a41774547fd61442443c1967f1a8e8b13',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa41bfc39f4114bbad7186e4b9b480da3',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a84d2573599cb14db8200acded518dd53',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ad3410f599c95c3268541e72f9684f82b',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#aee08a6146cbf90f361a828e6d2ff4ede',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a12bca8c5fdd115d24668beab2bb8ea27',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#ae203f025f99b18448dfd355a519c4121',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a87a9718ff816d6e1bdd9dca8e067e341',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_partial_rowwise_lamb_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a84dcaeb939254f551d6c356d1eca8747',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#af8829bdb0d543a40bb769900d36ea13e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ae9187ee78b193e34f92875da955dc6de',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ae7b604d06f2afe4b8d99b94b6a7ca46f',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a65ebc941a004af813be547c2114c6eca',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#aaba75f921548599cff242a4033a381c9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad0d4a168e8e591add8c872d4c2fff64a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#afbc119c8f230ecbf041ca9d852021a4a',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a634a690ed27c50d8308bcc0a9bf85acc',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#ad38fb7b8c66635da0517434c661ef2e2',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#ac9d49c8094b87daf6025d9195437119e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a86d693b20d7be5e068994e693d970104',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_unweighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ad7474c2dcf75a987f9526e730542ae16',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a76005fdee1a342df4b951b9191967576',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a73ddb7ffe3131b43c027bed87a21da0c',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#ad386be3805dc66bcebfcc75ae6ce20ce',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a557205856561135a510a45e915bc0714',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a866fa5e6f036f9befaef0a014527b214',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a15977bf39e5dbde54bc2d1176a9272b9',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a4c67d23288adf2fc636e9db4c30bfa5e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_weighted_vbe_kernel_warp_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a18e29f7653534f3a75e41cf3056d2634',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#ad0f3c1412b7b4ddb2f3c5262b27f5b46',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a56d820ab8e2e5c1e815ecbe5e906075e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#aaecc4ec4c793272693a37f0e027dfb93',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a254195fbabfff3c3ad9ba04db100afae',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a2fdb05c57c2efe83f57ce0ccfe97f861',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a28a51c35ffb6aac4d6b35c9b87960129',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a303830fd0513ecd4eb232556376ad2ff',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#abacbb190c3b418788aa37c065b93e703',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a6f94595430b5a0e8c1597b72f210095f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#ae73b050da138bd46bcb186f630a45f1e',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a0f05baa1d7dca3d78338fcd70e11487c',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a1296e33305fd2cde7e9e34e18e7e7905',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a49dd26094cead9644cbc35c29bb5bb21',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a709a4f70083ce173ce40562aa52ad3c8',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#abf79428f3dcf0b60bcff9074d587aeaf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a99c23e8020a9ae93a0d0d429c6940707',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a920aba769ec4eba77d74c4cce2f0aa5a',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aacedf2a727684a316ae18abf5670f8e8',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a9354545fca8047a3359cc39269e4531f',1,'__launch_bounds__(kBackwardMaxThreads) void 
split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a158fb407fba50cda959d3a60cbc01d91',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a0344106c25fea0c6358540ff4bd536f8',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ade29dc18e73de993e107177d9568fbdf',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a913d8fc72158bf301f064c0e60657a18',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a79b96d6a0be54ea86ebd1cadeedd2068',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a66e6adb0beac238f39d443dffa3c0161',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#ad4cbc31bac8a8d965f3549045cd85999',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a30ae1e9efc40a515dca89e5e3ef46565',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a4987b540b661f1caa132231f415c45a9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#abeb949f70e925c2f8011d973d75645fc',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_cta_per_row_1< uint8_t: 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a08d8db556761e8e68193b2cc8a32a1cc',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ac9a5abe82611fbf748e346094a7b24b2',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ae73620aca9ffc6e0cfd3b9cb594bdaf0',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a8275b2b19c2713679e0404cfc50cfc4f',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a2af51d716ed8d2b1a926e0f237b76f71',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#ad63ba5d695275d09b7f72a2e3fc6c124',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#afd015e1d0e79f14de8ed5bdf578c81df',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#abc14cf31cc4a8f906bc7f25d594fafc1',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a63e7a313c891f643c307bd05041a5b54',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#aaddcf08714b3cc33953d207c24e0be7f',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a320b7cb4717a06125d1e05149e7414a9',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1(const pta: 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a6736b927e85af06f2a8f64b95a527f35',1,'__launch_bounds__(kMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aec3f0f560b496881e95413f483dc0c32',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a1bac18bde859aad7fbfb3871a0bacf37',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_nobag_backward_codegen_sgd_unweighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#aaa0317297f080a5b537f22049d8ecbbe',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a9c866240eb5eb8df0da4e1ee803e04cf',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a040a74b95b542902bfb38bacd03202eb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1(const pta: 
gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#aa77ffcc8cedf9fe2668e96e9305bdccb',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_unweighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a02bd16452698dd0ae512e183e1ed25bb',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#aab426569c3d6a90703854ec88079c3cf',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ab2b8f92ece6c5a09d11a65969626378d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ac60290f3d38a825226fe8014a9274e3d',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a4ca2ae3bf6df90dd1f3a4bf8b534231e',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#ac83482e2c195bd6662609604217a4903',1,'__launch_bounds__(kMaxThreads) void 
split_embedding_backward_codegen_sgd_weighted_vbe_kernel_cta_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a412bd503e722e4451e55ef89a4bb3649',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1(const pta: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a5cbbacf7ec8ecfad9f032e7217474f71',1,'__launch_bounds__(kBackwardMaxThreads) void split_embedding_backward_codegen_sgd_weighted_vbe_kernel_warp_per_row_1< uint8_t: gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__split__grad_8cu.html#a2dd7fc517b5148ca80cff10cd7cbcaed',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_find_long_segments(const pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#aea453d06a5b06a7263bbb3c3c598b805',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_count_unique_indices_kernel(const pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#acfcb5a71381871c2d136a1e7ffc68b4c',1,'__launch_bounds__(kMaxThreads) void grad_mean_vbe_kernel(pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__grad_8cu.html#a9cbee37a9474b3f03b3e585c448b63ee',1,'__launch_bounds__(kMaxThreads) void grad_mean_kernel(pta: gen_embedding_backward_split_grad.cu'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#a422182213e14442c911aa3ba3ed18a58',1,'__launch_bounds__(kForwardMaxThreads) void split_embedding_codegen_grad_indice_weights_vbe_kernel(const pta: 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ab27358be96fd39a3d879e0e3f942c616',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a830a55ef37b6607a42e4b4cbb6889aa5',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a0178272d43da8f09567a976c98e4617c',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a13a4edf8545bd07a774fe7420e8d397b',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#aa128173842fe96c64a581b2efdd5fe7e',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel__small_8cu.html#a38384adec04c76c7f4267c8c1cdc7ff7',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_nobag_codegen_forward_unweighted_small_kernel< uint8_t: gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a40c420d5aadf8202b8a9de25931c44ff',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_weighted_kernel(const 
pta: gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aa272d7ae5549cc1f16cb4761f3edf890',1,'__launch_bounds__(kForwardMaxThreads) __global__ void dense_embedding_codegen_forward_weighted_kernel< uint8_t: gen_embedding_forward_dense_weighted_kernel.cu'],['../namespacenbit.html#adf462393afe5c0c395c48cf4f889c6f8',1,'nbit::__launch_bounds__(WarpsPerBlock *kWarpSize) __global__ void FP16_split_embedding_codegen_forward_unweighted_kernel_small_L(const pta'],['../namespacenbit.html#aced6599a5180c2faaff5bbb9bc92f147',1,'nbit::__launch_bounds__(4 *kWarpSize) __global__ void FP16_split_embedding_codegen_forward_unweighted_kernel_small_L< int32_t'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#ab08dd38a042ee1b012a6db152e28df6d',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a41deb3b48278a02504f49a2a3dc15cd8',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_kernel(const pta: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5ea0ab17f6d9eefd8f00e171c4d8b424',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a9b1f7936d16c021a06b52e10047d17c9',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_kernel(const pta: 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ae658cdd019bf968ffa65e519118af108',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_kernel< uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#af345685cdddd68d8304b0804863bc611',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel(const pta: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel__small_8cu.html#a4c26c8149d8b4a96823082303a657531',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_nobag_codegen_forward_unweighted_small_kernel< uint8_t: gen_embedding_forward_split_unweighted_nobag_kernel_small.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a98033ae44aee4b9db7201fdad50c28db',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a640269bb96d2014f8c117163f09d8228',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel< uint8_t: gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a718566769c1ceda303b72d8876532ea6',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_v2_kernel(const emb_t *__restrict__ const dev_weights: 
gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a02d4931cef892bdaf44d3ab510f0d655',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_vbe_kernel(const pta: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a95e359c3e33b1c2fcc6bb83a101c998f',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_unweighted_vbe_kernel< uint8_t: gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a0f7cdacc2963885ca7eddcf74c44c1e7',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_kernel(const pta: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a183af91deddd1a5f4c5d1657476d2594',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_kernel< uint8_t: gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7cf7d29de243a1d3d643b7f99420ca73',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ 
void split_embedding_codegen_forward_weighted_v2_kernel< uint8_t: gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a993a3437f132715df009e8cdd7a12806',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_v2_kernel(const emb_t *__restrict__ const dev_weights: gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a2b31286ebfaa57f2a8e43418dc0cc2bc',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_vbe_kernel(const pta: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a4e5e2097a867f5ac61d945360d16e1ed',1,'__launch_bounds__(kForwardMaxThreads) __global__ void split_embedding_codegen_forward_weighted_vbe_kernel< uint8_t: gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#ab54a42bb86f9a913d382b4938e3b023f',1,'__launch_bounds__(kMaxThreads) void split_rowwise_adagrad_update_kernel(at: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu'],['../gen__embedding__optimizer__rowwise__adagrad__split__kernel_8cu.html#a34f0bcf2172442db1cd089b529e81d11',1,'__launch_bounds__(kMaxThreads) void split_rowwise_adagrad_update_kernel< uint8_t: gen_embedding_optimizer_rowwise_adagrad_split_kernel.cu'],['../embedding__backward__split__grad__template_8cu.html#a2dd7fc517b5148ca80cff10cd7cbcaed',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_codegen_find_long_segments(const pta: embedding_backward_split_grad_template.cu'],['../embedding__backward__split__grad__template_8cu.html#aea453d06a5b06a7263bbb3c3c598b805',1,'__launch_bounds__(kMaxThreads) void split_embedding_backward_count_unique_indices_kernel(const pta: 
embedding_backward_split_grad_template.cu'],['../embedding__backward__split__kernel__cta__template_8cu.html#a436fa7b0b61202c628c4ca50bc9b1bcd',1,'__launch_bounds__(kMaxThreads) void: embedding_backward_split_kernel_cta_template.cu'],['../embedding__backward__split__kernel__warp__template_8cu.html#aa63bd2cb4cfc6b18191236e0a85bdd26',1,'__launch_bounds__(kBackwardMaxThreads) void: embedding_backward_split_kernel_warp_template.cu'],['../embedding__backward__split__template_8cu.html#a436fa7b0b61202c628c4ca50bc9b1bcd',1,'__launch_bounds__(kMaxThreads) void: embedding_backward_split_template.cu'],['../embedding__bounds__check_8cu.html#a9fcdcf37685cd2ec9b88dfac7e77aaaa',1,'__launch_bounds__(kMaxThreads) void bounds_check_indices_kernel(const at: embedding_bounds_check.cu'],['../namespacenbit.html#a0a75b5eade7f9536629ce45b5827fb31',1,'nbit::__launch_bounds__()'],['../embedding__forward__split__kernel__nobag__small__template_8cu.html#a5c289e92014011ec16430dabf2272ae8',1,'__launch_bounds__(kForwardMaxThreads) __global__ void: embedding_forward_split_kernel_nobag_small_template.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a9bbd05d6885ea75e7564678a52104538',1,'__launch_bounds__(kForwardMaxThreads, 2048/kForwardMaxThreads) __global__ void split_embedding_codegen_forward_: embedding_forward_split_kernel_v2_template.cu'],['../embedding__optimizer__split__kernel__template_8cu.html#a69cc59925f75e23b97fe9e48e72bb900',1,'__launch_bounds__(kMaxThreads) void split_: embedding_optimizer_split_kernel_template.cu'],['../embedding__optimizer__split__template_8cu.html#a69cc59925f75e23b97fe9e48e72bb900',1,'__launch_bounds__(kMaxThreads) void split_: embedding_optimizer_split_template.cu'],['../bench__utils_8cuh.html#a59e0073dcf6e90b2d7a7b38f6210cb50',1,'__launch_bounds__(kMaxThreads) void flush_gpu(char *d_flush: bench_utils.cuh'],['../namespacefbgemm__gpu.html#a17d5a2e40c83e6e3f5c68e375bf468f7',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) __global__ void 
embedding_inplace_update_kernel(at'],['../namespacefbgemm__gpu.html#ac93e7c311a1d26fbe8815c8b34a6bde4',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void pruned_array_lookup_from_row_idx_kernel(const at'],['../namespacefbgemm__gpu.html#a50af77e9607a7a96addff8aa8e5e4508',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void to_dense_segment_value_kernel(const int64_t num_lengths'],['../namespacefbgemm__gpu.html#a28846f89e09ae2fc064e73142d83ceef',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_dense_bmm_kernel(const pta'],['../namespacefbgemm__gpu.html#ad21c70bdd84772ee2b9b3950c87e9791',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta'],['../namespacefbgemm__gpu.html#afd2e24ffed8f057a2092d699b4cb3cb0',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void jagged_index_add_2d_kernel(at'],['../namespacefbgemm__gpu.html#ac59415a66e49753fb42195f0d816c7c2',1,'fbgemm_gpu::__launch_bounds__(kMaxThreads) void _block_bucketize_sparse_features_cuda_kernel2(int lengths_size'],['../transpose__embedding__input_8cu.html#a91943a24b789081d81916b94ee7789ad',1,'__launch_bounds__(kMaxThreads) void linearize_index_kernel(const at: transpose_embedding_input.cu'],['../transpose__embedding__input_8cu.html#aee01a74e30c13b20ffba0c0737c44425',1,'__launch_bounds__(kMaxThreads) void linearize_index_index_select_kernel(const at: transpose_embedding_input.cu'],['../ssd__split__embeddings__cache__cuda_8cu.html#a7d15f4b6131224480844be177fe6b28d',1,'__launch_bounds__(kMaxThreads) void masked_index_put_kernel(at: ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__embeddings__cache__cuda_8cu.html#aac79184e1b6e3d831580eba191b6da2e',1,'__launch_bounds__(kMaxThreads) void masked_index_put_kernel(at: ssd_split_embeddings_cache_cuda.cu']]], + ['_5fbfloat16_5fto_5ffloat_5fcpu_2',['_bfloat16_to_float_cpu',['../namespacefbgemm__gpu.html#ad8c67a657c3008d1d87472f216f7908f',1,'fbgemm_gpu']]], + 
['_5fbfloat16_5fto_5ffloat_5fgpu_3',['_bfloat16_to_float_gpu',['../group__quantize-ops-cuda.html#ga2076a59fd190690f67c1eddb79b6acc4',1,'fbgemm_gpu']]], + ['_5fblock_5fbucketize_5fsparse_5ffeatures_5fcpu_4',['_block_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#adaf7cd0195ff361555f35a017c018d25',1,'fbgemm_gpu']]], + ['_5fbucketize_5fsparse_5ffeatures_5fcpu_5',['_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a1f2b214db9aa3f8887c267c0ea9f5edf',1,'fbgemm_gpu']]], + ['_5fcat_5fint_5ftensors_6',['_cat_int_tensors',['../namespacefbgemm__gpu.html#acd8fa4397185c592f5eac101b42504a6',1,'fbgemm_gpu']]], + ['_5fcat_5fint_5ftensors_5fwith_5fpadding_7',['_cat_int_tensors_with_padding',['../namespacefbgemm__gpu.html#a1376d05f5d6efb4fbdb869e391702adf',1,'fbgemm_gpu']]], + ['_5fcat_5fper_5fsample_5fweights_5flist_8',['_cat_per_sample_weights_list',['../namespacefbgemm__gpu.html#a0eec17207e4a69da15dae845d02721e5',1,'fbgemm_gpu']]], + ['_5fexpand_5finto_5fjagged_5fpermute_5fcpu_5fkernel_9',['_expand_into_jagged_permute_cpu_kernel',['../namespacefbgemm__gpu.html#ac339123bb72d7421fca2d2b56821f02a',1,'fbgemm_gpu']]], + ['_5ffloat_5for_5fhalf_5fto_5ffusednbitrowwise_5fgpu_10',['_float_or_half_to_fusednbitrowwise_gpu',['../group__sparse-data-cuda.html#ga3b963d0e45c2bc0060aaa974efe64b8a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fbfloat16_5fcpu_11',['_float_to_bfloat16_cpu',['../namespacefbgemm__gpu.html#a51665269174ef625316e519465a67839',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fbfloat16_5fgpu_12',['_float_to_bfloat16_gpu',['../group__quantize-ops-cuda.html#ga2f1cc4b6dc6f708324855f94d558cfc1',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffp8rowwise_5fgpu_13',['_float_to_FP8rowwise_gpu',['../group__quantize-ops-cuda.html#ga31b9029d43a60ad1fc90dc6ec54af9db',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffp8rowwise_5fgpu_5ft_14',['_float_to_FP8rowwise_gpu_t',['../namespacefbgemm__gpu.html#a6c5dca8da7ca5c5f89ecdc816745ba29',1,'fbgemm_gpu']]], + 
['_5ffloat_5fto_5ffused8bitrowwise_5fcpu_5fout_15',['_float_to_fused8bitrowwise_cpu_out',['../group__quantize-data-cpu.html#gad38a9310258acccab8a017c1616034d0',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fcpu_5fout_5ft_16',['_float_to_fused8bitrowwise_cpu_out_t',['../namespacefbgemm__gpu.html#a7f58b5ea1ea6cd38a42f73e5d688bb2c',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fgpu_17',['_float_to_fused8bitrowwise_gpu',['../group__quantize-ops-cuda.html#ga8c11c8dc06cae57b3afba79358c00e99',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffused8bitrowwise_5fgpu_5ft_18',['_float_to_fused8bitrowwise_gpu_t',['../namespacefbgemm__gpu.html#a16bbb8557f4229489d966bb1d11bd00c',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fcpu_19',['_float_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a29553ad77238659bb86c14842103d1d5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fgpu_20',['_float_to_fusednbitrowwise_gpu',['../group__quantize-ops-cuda.html#gaa3e8fd136e9bfa0e4d0c0016659bf708',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5ffusednbitrowwise_5fgpu_5ft_21',['_float_to_fusednbitrowwise_gpu_t',['../group__quantize-ops-cuda.html#ga02c8f9158646d9b16efbd3853711f56a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fhfp8_5fcpu_22',['_float_to_hfp8_cpu',['../namespacefbgemm__gpu.html#a70e9b9692aae9789f0a3804b9d12efe5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fhfp8_5fgpu_23',['_float_to_hfp8_gpu',['../group__quantize-ops-cuda.html#gab2837424e3774fe34ba255658554a75a',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fmsfp_5fgpu_24',['_float_to_msfp_gpu',['../group__quantize-ops-cuda.html#ga427f81e1d8901e2fafc9611860fbd4d5',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fpaddedfp8rowwise_5fgpu_25',['_float_to_paddedFP8rowwise_gpu',['../group__quantize-ops-cuda.html#ga5043927653e4d50462b79b7f3df33223',1,'fbgemm_gpu']]], + ['_5ffloat_5fto_5fpaddedfp8rowwise_5fgpu_5ft_26',['_float_to_paddedFP8rowwise_gpu_t',['../namespacefbgemm__gpu.html#a1d80140f030f2ca22fd14560e2d8aa42',1,'fbgemm_gpu']]], + 
['_5ffp8rowwise_5fto_5ffloat_5fgpu_27',['_FP8rowwise_to_float_gpu',['../namespacefbgemm__gpu.html#a70d90c85fad4384b23c8958a6c300ce2',1,'fbgemm_gpu']]], + ['_5ffp8rowwise_5fto_5ffloat_5fgpu_5ft_28',['_FP8rowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#ac8931bd574641641dc69eadaae32efe3',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fcpu_5fout_29',['_fused8bitrowwise_to_float_cpu_out',['../group__quantize-data-cpu.html#gabeb6675833a5b14e0a0d01385770a771',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fcpu_5fout_5ft_30',['_fused8bitrowwise_to_float_cpu_out_t',['../namespacefbgemm__gpu.html#acc6b77e9be7ff8c2e5f16297fa6fad38',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fgpu_31',['_fused8bitrowwise_to_float_gpu',['../namespacefbgemm__gpu.html#aab093a380068925d1b267452a1e255c2',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fgpu_5ft_32',['_fused8bitrowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#a25d0793a9d1fe66bccad409791738b7b',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5ffloat_5fmixed_5fdim_5fgpu_33',['_fused8bitrowwise_to_float_mixed_dim_gpu',['../group__quantize-ops-cuda.html#ga4c2c033e940095d20e76e9e00fe925d3',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5fhalf_5fgpu_34',['_fused8bitrowwise_to_half_gpu',['../namespacefbgemm__gpu.html#a3aa2e594cf4bbb5cb5241c4eaa593f8a',1,'fbgemm_gpu']]], + ['_5ffused8bitrowwise_5fto_5fsingle_5for_5fhalf_5fprecision_5fgpu_35',['_fused8bitrowwise_to_single_or_half_precision_gpu',['../group__quantize-ops-cuda.html#gafacdb4ec7d8f5b969c75d2127537ab16',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5fcpu_36',['_fusednbitrowwise_to_float_cpu',['../namespacefbgemm__gpu.html#aa6141e72712885a0c89d74829be2fe6a',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5fgpu_37',['_fusednbitrowwise_to_float_gpu',['../namespacefbgemm__gpu.html#ae0193dd7bbb4e72fc977330cc3f019a4',1,'fbgemm_gpu']]], + 
['_5ffusednbitrowwise_5fto_5ffloat_5fgpu_5ft_38',['_fusednbitrowwise_to_float_gpu_t',['../group__quantize-ops-cuda.html#gae1e827b74f0825dc4135e68c10e443b3',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5ffloat_5for_5fhalf_5fgpu_39',['_fusednbitrowwise_to_float_or_half_gpu',['../group__quantize-ops-cuda.html#ga07f4c02c95710472b815bdc1d7bfff19',1,'fbgemm_gpu']]], + ['_5ffusednbitrowwise_5fto_5fhalf_5fgpu_40',['_fusednbitrowwise_to_half_gpu',['../group__quantize-ops-cuda.html#ga6152517943258bd3adc42b7c103a9277',1,'fbgemm_gpu']]], + ['_5fgeneric_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_5fkernel_41',['_generic_histogram_binning_calibration_by_feature_cpu_kernel',['../namespacefbgemm__gpu.html#accd75a24d809f4322a18bfb12f47b343',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffused8bitrowwise_5fcpu_5fout_42',['_half_to_fused8bitrowwise_cpu_out',['../namespacefbgemm__gpu.html#a23bfcbc4afa5dd7d35ee03b7f23840a9',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffused8bitrowwise_5fgpu_43',['_half_to_fused8bitrowwise_gpu',['../namespacefbgemm__gpu.html#adfeb2fc956b7aa5c2446a00ccbcd058e',1,'fbgemm_gpu']]], + ['_5fhalf_5fto_5ffusednbitrowwise_5fgpu_44',['_half_to_fusednbitrowwise_gpu',['../group__quantize-ops-cuda.html#ga6e2bd64f3f9e3b36493ec955680771af',1,'fbgemm_gpu']]], + ['_5fhfp8_5fto_5ffloat_5fcpu_45',['_hfp8_to_float_cpu',['../namespacefbgemm__gpu.html#aaa8438f606e84d5cb07827759163bec6',1,'fbgemm_gpu']]], + ['_5fhfp8_5fto_5ffloat_5fgpu_46',['_hfp8_to_float_gpu',['../group__quantize-ops-cuda.html#ga03a8f8825a16c6235b699886fa46e1f6',1,'fbgemm_gpu']]], + ['_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_5fkernel_47',['_histogram_binning_calibration_by_feature_cpu_kernel',['../namespacefbgemm__gpu.html#adce89aa38a4a22058ec42b5077bbe23a',1,'fbgemm_gpu']]], + ['_5fhistogram_5fbinning_5fcalibration_5fcpu_5fkernel_48',['_histogram_binning_calibration_cpu_kernel',['../namespacefbgemm__gpu.html#a7639f61a587aa5052c488fbd00d3784b',1,'fbgemm_gpu']]], + 
['_5finvert_5fpermute_5fcpu_5fkernel_49',['_invert_permute_cpu_kernel',['../namespacefbgemm__gpu.html#a7a8e9e91365de25b995833c08eb32eff',1,'fbgemm_gpu']]], + ['_5fmsfp_5fto_5ffloat_5fgpu_50',['_msfp_to_float_gpu',['../group__quantize-ops-cuda.html#gac0c20377454dbfafcc5ac245fe6427ce',1,'fbgemm_gpu']]], + ['_5fpaddedfp8rowwise_5fto_5ffloat_5fgpu_51',['_paddedFP8rowwise_to_float_gpu',['../namespacefbgemm__gpu.html#afc30bb56977528d8a85e43f9aa5c2cf8',1,'fbgemm_gpu']]], + ['_5fpaddedfp8rowwise_5fto_5ffloat_5fgpu_5ft_52',['_paddedFP8rowwise_to_float_gpu_t',['../namespacefbgemm__gpu.html#a0c0b93e239757d9564c51f8922f17554',1,'fbgemm_gpu']]], + ['_5fpermute_5f1d_5findices_5fweights_5fkernel_5fcpu_53',['_permute_1D_indices_weights_kernel_cpu',['../namespacefbgemm__gpu.html#af0e07ade6f2b89bf71c344aac8106b59',1,'fbgemm_gpu']]], + ['_5fpermute_5f1d_5flengths_5fcpu_5fkernel_54',['_permute_1D_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a8dfcdb2c902cf1c4e5d0ed916d5fe779',1,'fbgemm_gpu']]], + ['_5fpermute_5f2d_5findices_5fweights_5fkernel_5fcpu_55',['_permute_2D_indices_weights_kernel_cpu',['../namespacefbgemm__gpu.html#acad68edeefe7a7710f729cdc56876851',1,'fbgemm_gpu']]], + ['_5fpermute_5f2d_5flengths_5fcpu_5fkernel_56',['_permute_2D_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a72c447e3b6d38b548d89ebc464e2d469',1,'fbgemm_gpu']]], + ['_5fpermute_5fdata_5fkernel_5fcpu_57',['_permute_data_kernel_cpu',['../namespacefbgemm__gpu.html#a2fb715b347e075f3331083905cdaadfb',1,'fbgemm_gpu']]], + ['_5fpermute_5fembeddings_5fkernel_5fcpu_58',['_permute_embeddings_kernel_cpu',['../namespacefbgemm__gpu.html#a6987e1403a25c256168873616dffbdf6',1,'fbgemm_gpu']]], + ['_5fpermute_5flengths_5fcpu_5fkernel_59',['_permute_lengths_cpu_kernel',['../namespacefbgemm__gpu.html#a4c7749afd2c661b1d302268035fde42b',1,'fbgemm_gpu']]], + ['_5fsegment_5fsum_5fcsr_5fcpu_5fkernel_60',['_segment_sum_csr_cpu_kernel',['../namespacefbgemm__gpu.html#ade08c8b174b0ecbb99d01ad87b4da0b3',1,'fbgemm_gpu']]], + 
['_5fsingle_5for_5fhalf_5fprecision_5fto_5ffused8bitrowwise_5fgpu_61',['_single_or_half_precision_to_fused8bitrowwise_gpu',['../group__quantize-ops-cuda.html#gaff285349cb9c51a56fc418b628772b16',1,'fbgemm_gpu']]], + ['_5fupdate_5fkernel_62',['_update_kernel',['../embedding__optimizer__split__kernel__template_8cu.html#afab484072b9b8381500b14e31ba49364',1,'_update_kernel(at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights, at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights, at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const int32_t max_D, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, {{ args.split_kernel_args|join(", ") }}): embedding_optimizer_split_kernel_template.cu'],['../embedding__optimizer__split__template_8cu.html#afab484072b9b8381500b14e31ba49364',1,'_update_kernel(at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > dev_weights, at::PackedTensorAccessor64< emb_t, 1, at::RestrictPtrTraits > uvm_weights, at::PackedTensorAccessor64< cache_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const at::PackedTensorAccessor32< emb_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_dev_indices, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const at::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_lxu_cache_locations, const int32_t max_D, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, {{ args.split_kernel_args|join(", ") }}): embedding_optimizer_split_kernel_template.cu']]], + ['_5fv2_5fkernel_63',['_v2_kernel',['../embedding__forward__split__kernel__v2__template_8cu.html#a20b736346ad19821ed9748c4dde5b058',1,'embedding_forward_split_kernel_v2_template.cu']]] ]; diff --git a/search/functions_1.js b/search/functions_1.js index 81fb97c28..015f6c47c 100644 --- a/search/functions_1.js +++ b/search/functions_1.js @@ -1,5 +1,32 @@ var searchData= [ - ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcuda_0',['direct_mapped_lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#gae019b6879bd9f89a146e0700d5a4bd8b',1,'split_embeddings_cache_cuda.cuh']]], - ['direct_5fmapped_5flxu_5fcache_5flookup_5fcuda_1',['direct_mapped_lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#gab305ebdd3822794c5ac462bf5df4bb49',1,'split_embeddings_cache_cuda.cuh']]] + ['accumulate_5ffp16_0',['accumulate_fp16',['../namespacefbgemm__gpu.html#a3de0ed0985acc3edc0583b6cd56a43f2',1,'fbgemm_gpu']]], + ['accumulate_5ffp32_1',['accumulate_fp32',['../namespacefbgemm__gpu.html#aeb3ef6437b744f52b29910361f83336c',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fhfp8_2',['accumulate_packed_hfp8',['../namespacefbgemm__gpu.html#acc596fdaac7efc925d19d7374251e8cb',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint2_3',['accumulate_packed_int2',['../namespacefbgemm__gpu.html#a857c58d8bfc412a3901414ef0b0f73c5',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint4_4',['accumulate_packed_int4',['../namespacefbgemm__gpu.html#af3478ab6f636e80a75953ffc1d8caed9',1,'fbgemm_gpu']]], + ['accumulate_5fpacked_5fint8_5',['accumulate_packed_int8',['../namespacefbgemm__gpu.html#a24c22ef27a441cb888d3b32957588794',1,'fbgemm_gpu']]], + 
['accumulate_5fweighted_5ffp16_6',['accumulate_weighted_fp16',['../namespacefbgemm__gpu.html#a2700bcf99c82f2491a174d51c462e4e8',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5ffp32_7',['accumulate_weighted_fp32',['../namespacefbgemm__gpu.html#a7225f36d3ef25f69273160500bd0b9a7',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fhfp8_8',['accumulate_weighted_packed_hfp8',['../namespacefbgemm__gpu.html#aa177a98d987438afcde04f7fc2cba71a',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint2_9',['accumulate_weighted_packed_int2',['../namespacefbgemm__gpu.html#aebe17b37f24d82ea8cfbd296e307d5ab',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint4_10',['accumulate_weighted_packed_int4',['../namespacefbgemm__gpu.html#ade03f1b4099c9ecaf38d7d6a0eb7d595',1,'fbgemm_gpu']]], + ['accumulate_5fweighted_5fpacked_5fint8_11',['accumulate_weighted_packed_int8',['../namespacefbgemm__gpu.html#a80d2d456b1c87f68c9098d5e5d1fd47d',1,'fbgemm_gpu']]], + ['add_12',['add',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a3421b900475f40701fb4c0c1c542744c',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a5686a6ec8884ddf2ad633d735d181011',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ac26f750f3fa72d8b137026cc8726972f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::add()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a4909df6d879ffbb0e234114609ce3000',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT 
>::add()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a827812cf195008164049b47d4fc9efc1',1,'fbgemm_gpu::Vec4AccT::add(const float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a533e0b7fe298fd776f58607d9f67bda1',1,'fbgemm_gpu::Vec4AccT::add(const float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a14f0714a4e51293efb99e3d6815be3a2',1,'fbgemm_gpu::Vec4AccT::add(const uint8_t *ptr)']]], + ['add_5f_13',['add_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< float >::add_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< at::Half >::add_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a3f8a7e8e00c59205f3b32b345290922b',1,'fbgemm_gpu::Vec4T< at::Half >::add_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#af3cbc396133203521c050935239eebe2',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::add_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a3f8a7e8e00c59205f3b32b345290922b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::add_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ae0cdda7691531bfb7975dad742ff3984',1,'fbgemm_gpu::Vec4T< double >::add_()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a746ed2bbabd0878f33b478c587bde0cf',1,'fbgemm_gpu::Vec4AccT::add_(const float *vals)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a5eebdd38332484343d4400fd08f3b549',1,'fbgemm_gpu::Vec4AccT::add_(const half2 *vals_h)']]], + ['adjust_5finfo_5fb_5fnum_5fbits_14',['adjust_info_B_num_bits',['../split__embeddings__utils_8cuh.html#aaaa05e63829893f17b951de7dc993747',1,'adjust_info_B_num_bits(int32_t B, int32_t T): get_infos_metadata.cu'],['../get__infos__metadata_8cu.html#a315ee6fa620a68c902298d741ac8989d',1,'adjust_info_B_num_bits(int32_t B, int32_t T): get_infos_metadata.cu']]], + 
['adjust_5foffset_5fkernel_15',['adjust_offset_kernel',['../embedding__bounds__check_8cu.html#af9e26c2f2d6dfef45e1a12507d8c2b72',1,'embedding_bounds_check.cu']]], + ['all_5fto_5fone_5fdevice_16',['all_to_one_device',['../group__merge-pooled-emb.html#ga3933c7465129b58edd60ffcc1999c223',1,'fbgemm_gpu']]], + ['assign_17',['assign',['../namespacefbgemm__gpu.html#a6e69d027d43eb7e92ea620d43ae43cb1',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_18',['asynchronous_complete_cumsum',['../transpose__embedding__input_8cu.html#ae27e2b1fda2a338ce8f7f2207b580e7f',1,'transpose_embedding_input.cu']]], + ['asynchronous_5fcomplete_5fcumsum_5fcpu_19',['asynchronous_complete_cumsum_cpu',['../namespacefbgemm__gpu.html#a98effac974dc3fe5bbcc4ce8a75578f7',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_5fgpu_20',['asynchronous_complete_cumsum_gpu',['../namespacefbgemm__gpu.html#a1f31ee9922c98ad5d013361368f2f5ac',1,'fbgemm_gpu']]], + ['asynchronous_5fcomplete_5fcumsum_5fmeta_21',['asynchronous_complete_cumsum_meta',['../namespacefbgemm__gpu.html#a656bb5222f2a0bc92d5b895ba0fa846c',1,'fbgemm_gpu']]], + ['asynchronous_5fexclusive_5fcumsum_5fcpu_22',['asynchronous_exclusive_cumsum_cpu',['../namespacefbgemm__gpu.html#a69fe5be794026bdb73b0196be9b345a4',1,'fbgemm_gpu']]], + ['asynchronous_5fexclusive_5fcumsum_5fgpu_23',['asynchronous_exclusive_cumsum_gpu',['../namespacefbgemm__gpu.html#afd8b0919b5b3b021a8eb3727e304d5b4',1,'fbgemm_gpu']]], + ['asynchronous_5fexclusive_5fcumsum_5fmeta_24',['asynchronous_exclusive_cumsum_meta',['../namespacefbgemm__gpu.html#ae96f1ffdb8ed1efd58561364fbaf3c6a',1,'fbgemm_gpu']]], + ['asynchronous_5finclusive_5fcumsum_5fcpu_25',['asynchronous_inclusive_cumsum_cpu',['../namespacefbgemm__gpu.html#a8930419ab36c85750182c12db95baa29',1,'fbgemm_gpu']]], + ['asynchronous_5finclusive_5fcumsum_5fgpu_26',['asynchronous_inclusive_cumsum_gpu',['../namespacefbgemm__gpu.html#acc0c0e7f6e816900474b2e52756ac891',1,'fbgemm_gpu']]], + 
['at_27',['at',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a95ed732ddbdd788721e2c0fc17a3d8a0',1,'fbgemm_gpu::TensorAccessorBase::at()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a95ed732ddbdd788721e2c0fc17a3d8a0',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::at()']]], + ['auc_5fkernel_28',['auc_kernel',['../namespacefbgemm__gpu.html#a4bcadae3f465ece7979bf89f0c1cf22a',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_10.js b/search/functions_10.js new file mode 100644 index 000000000..9a5cef712 --- /dev/null +++ b/search/functions_10.js @@ -0,0 +1,61 @@ +var searchData= +[ + ['pack_5fsegments_5fautograd_0',['pack_segments_autograd',['../namespacefbgemm__gpu.html#a24fd2f4efa543ea716010c3fc1832587',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fbackward_5fcpu_1',['pack_segments_backward_cpu',['../namespacefbgemm__gpu.html#a51f0921a8e934c6c4d0fca5ebb5d8338',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fbackward_5fcuda_2',['pack_segments_backward_cuda',['../namespacefbgemm__gpu.html#aaded8e25bef3a32580d71dc2ead25f0c',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcpu_3',['pack_segments_cpu',['../namespacefbgemm__gpu.html#a01151883c1840f280f4f9c083677c8b5',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcuda_4',['pack_segments_cuda',['../namespacefbgemm__gpu.html#a049c248a78797b27f5e053809c13b88e',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fcuda_5fkernel_5',['pack_segments_cuda_kernel',['../namespacefbgemm__gpu.html#a3ff1eed5a38a10b4da916f9ec154f225',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fforward_5fcpu_6',['pack_segments_forward_cpu',['../namespacefbgemm__gpu.html#a49cb5dd543cc63e932f458e1c79c0d00',1,'fbgemm_gpu']]], + ['pack_5fsegments_5fforward_5fcuda_7',['pack_segments_forward_cuda',['../namespacefbgemm__gpu.html#a4bec138cb5be2583288d026eb4185646',1,'fbgemm_gpu']]], + ['padded_5fd_8',['padded_D',['../namespacenbit.html#a45a36e2eb0376c3e37728ea312851cd7',1,'nbit']]], + 
['padded_5frow_5fsize_5fin_5fbytes_9',['padded_row_size_in_bytes',['../namespacenbit.html#a3ac5bf25115544f9067032bef644a215',1,'nbit']]], + ['padding_5ffused_5ftbe_5finput_5fcombine_5fcpu_10',['padding_fused_tbe_input_combine_cpu',['../group__input-combine.html#ga9ab60fbe75053c2f31f7d3f16dfa476f',1,'fbgemm_gpu']]], + ['padding_5ffused_5ftbe_5finput_5fcombine_5fwith_5flength_5fcpu_11',['padding_fused_tbe_input_combine_with_length_cpu',['../namespacefbgemm__gpu.html#af01b4023830652f0cc3e99c87f7b4526',1,'fbgemm_gpu']]], + ['permute102_5fbaddbmm_5fpermute102_5fcpu_12',['permute102_baddbmm_permute102_cpu',['../namespacefbgemm__gpu.html#ab8d862f0ffee51a4d276f3989f0ab24b',1,'fbgemm_gpu']]], + ['permute102_5fbaddbmm_5fpermute102_5fcuda_13',['permute102_baddbmm_permute102_cuda',['../namespacefbgemm__gpu.html#a0c3f53164eb98c0b45b5aaef3e99a172',1,'fbgemm_gpu']]], + ['permute_5f1d_5fsparse_5fdata_5fcpu_14',['permute_1D_sparse_data_cpu',['../namespacefbgemm__gpu.html#a22758d46158e49801e876ab269855736',1,'fbgemm_gpu']]], + ['permute_5f2d_5fsparse_5fdata_5fcpu_15',['permute_2D_sparse_data_cpu',['../namespacefbgemm__gpu.html#a83da584464d49a223941e4b926b9676a',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fcpu_16',['permute_duplicate_pooled_embs_auto_grad_cpu',['../namespacefbgemm__gpu.html#aeabdb24bef8b30a2b80b94a676b2b5fb',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fgpu_17',['permute_duplicate_pooled_embs_auto_grad_gpu',['../namespacefbgemm__gpu.html#a242a088c94da1f0b016087bef8460622',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fcpu_18',['permute_duplicate_pooled_embs_auto_grad_split_cpu',['../namespacefbgemm__gpu.html#af0cdb20f76a1c62644ad644e4c7210ad',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fgpu_19',['permute_duplicate_pooled_embs_auto_grad_split_gpu',['../namespacefbgemm__gpu.html#a276c76fa5487668edb8477a844ca1704',1,'fbgemm_gpu']]], + 
['permute_5fduplicate_5fpooled_5fembs_5fcpu_20',['permute_duplicate_pooled_embs_cpu',['../namespacefbgemm__gpu.html#acc5af8d2639bda183a7758a7fb4d4e9a',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fgpu_21',['permute_duplicate_pooled_embs_gpu',['../namespacefbgemm__gpu.html#aecf7e9c2b36bb349c98294b9abfcf7c1',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fsplit_5fcpu_22',['permute_duplicate_pooled_embs_split_cpu',['../namespacefbgemm__gpu.html#a286571e933b530189672faaa53ee20e6',1,'fbgemm_gpu']]], + ['permute_5fduplicate_5fpooled_5fembs_5fsplit_5fgpu_23',['permute_duplicate_pooled_embs_split_gpu',['../namespacefbgemm__gpu.html#a34e792da7d58bd96fc1c9d4c0b1b3a2a',1,'fbgemm_gpu']]], + ['permute_5fembeddings_5fkernel_24',['permute_embeddings_kernel',['../namespacefbgemm__gpu.html#a2b00efff9050b6bec363081afc5c3c2f',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_25',['permute_pooled_embs_auto_grad',['../group__permute-pooled-embs-cpu.html#ga3fd0766d863a18ea5cce4bfdef6a0349',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fcpu_26',['permute_pooled_embs_auto_grad_cpu',['../group__permute-pooled-embs-cpu.html#gac050c22198470709b89b4d5b160006b0',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fgpu_27',['permute_pooled_embs_auto_grad_gpu',['../group__permute-pooled-embs-gpu.html#gad0d8a6f85fc81bc54e4c20e60fe6eb11',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fmeta_28',['permute_pooled_embs_auto_grad_meta',['../namespacefbgemm__gpu.html#a4381e6e500aad1cf049aa509fc17b16b',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fcpu_29',['permute_pooled_embs_auto_grad_split_cpu',['../group__permute-pooled-embs-cpu.html#ga62bb71eb3e7a980ce5efded317717189',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fauto_5fgrad_5fsplit_5fgpu_30',['permute_pooled_embs_auto_grad_split_gpu',['../group__permute-pooled-embs-gpu.html#gab5673b48b58896e4954cc8fc7c90c4d8',1,'fbgemm_gpu']]], + 
['permute_5fpooled_5fembs_5fcpu_31',['permute_pooled_embs_cpu',['../namespacefbgemm__gpu.html#aa321302401045119810e93f42a361f1f',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fcpu_5fimpl_32',['permute_pooled_embs_cpu_impl',['../group__permute-pooled-embs-cpu.html#ga39797562608b1226fc1632f815f7d8a2',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fgpu_33',['permute_pooled_embs_gpu',['../namespacefbgemm__gpu.html#a9b4a18abd526ab3e9c95f782d87afbbb',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fgpu_5fimpl_34',['permute_pooled_embs_gpu_impl',['../namespacefbgemm__gpu.html#aca0e73083114d9eea99129e54b89fa23',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fkernel_35',['permute_pooled_embs_kernel',['../layout__transform__ops_8cuh.html#acf1671783450ed8e673d22cbc1d917b5',1,'layout_transform_ops.cuh']]], + ['permute_5fpooled_5fembs_5fmeta_36',['permute_pooled_embs_meta',['../namespacefbgemm__gpu.html#a1183d2ce4456d290df04c32b215fc22e',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fcpu_37',['permute_pooled_embs_split_cpu',['../group__permute-pooled-embs-cpu.html#ga21fd23f8f0de62159529356ebf7eb1f1',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fcpu_5fimpl_38',['permute_pooled_embs_split_cpu_impl',['../namespacefbgemm__gpu.html#a9ce974f08ff3cb46289f39af5ea7fcec',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fgpu_39',['permute_pooled_embs_split_gpu',['../group__permute-pooled-embs-gpu.html#ga342967f8cc4e25c7655d1987536cdc6b',1,'fbgemm_gpu']]], + ['permute_5fpooled_5fembs_5fsplit_5fgpu_5fimpl_40',['permute_pooled_embs_split_gpu_impl',['../namespacefbgemm__gpu.html#a0d587655a374b11bb6b7febcabe0f403',1,'fbgemm_gpu']]], + ['permute_5fsequence_5fembeddings_5fcpu_41',['permute_sequence_embeddings_cpu',['../namespacefbgemm__gpu.html#a6c601604b9a15b45176ad42d4ca04d7d',1,'fbgemm_gpu']]], + ['permute_5fsequence_5fembeddings_5fcuda_42',['permute_sequence_embeddings_cuda',['../namespacefbgemm__gpu.html#a713a7245a4295a57007802212dca05ee',1,'fbgemm_gpu']]], + 
['permute_5fsparse_5ffeatures_5fcpu_43',['permute_sparse_features_cpu',['../namespacefbgemm__gpu.html#a7eec8c74f87d4204857061b761a17ede',1,'fbgemm_gpu']]], + ['prefix_5fsum_44',['prefix_sum',['../namespacefbgemm__gpu.html#a82c664395e6340a5878c867fcf278bfc',1,'fbgemm_gpu']]], + ['process_5fall_5findices_5flarge_5fls_45',['process_all_indices_large_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#ad4f576c80cbb86fce55f5420968bc826',1,'process_all_indices_large_Ls(long *const smem, const uint32_t L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): embedding_forward_split_kernel_v2_template.cu']]], + ['process_5fall_5findices_5fno_5fpooling_46',['process_all_indices_no_pooling',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a02fb6083bc1f3a1c39dabb7818866a46',1,'process_all_indices_no_pooling(long *const smem, const bool process_d, const uint32_t params_offset): embedding_forward_split_kernel_v2_template.cu']]], + ['process_5fall_5findices_5fsmall_5fls_47',['process_all_indices_small_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a871fb6b516157e559e3ed26b56e4245c',1,'process_all_indices_small_Ls(long *const smem, const uint32_t total_L, const bool process_d, const bool mean_pooling, const uint32_t params_offset, const uint32_t max_D_cache): embedding_forward_split_kernel_v2_template.cu']]], + ['pruned_5farray_5flookup_5fcpu_48',['pruned_array_lookup_cpu',['../group__embedding-cpu.html#ga50d9da3c5bc1fe8b9cabfbda212c2ea5',1,'pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga50d9da3c5bc1fe8b9cabfbda212c2ea5',1,'pruned_array_lookup_cpu(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + 
['pruned_5farray_5flookup_5fcuda_49',['pruned_array_lookup_cuda',['../group__embedding-cuda.html#gaea1d3ae26d1e893ccf08f8b55b3d6eff',1,'pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): embedding_forward_quantized_split_lookup.cu'],['../group__embedding-cuda.html#gaea1d3ae26d1e893ccf08f8b55b3d6eff',1,'pruned_array_lookup_cuda(Tensor indices, Tensor offsets, Tensor index_remappings, Tensor index_remappings_offsets): embedding_forward_quantized_split_lookup.cu']]], + ['pruned_5farray_5flookup_5ffrom_5frow_5fidx_5fcpu_50',['pruned_array_lookup_from_row_idx_cpu',['../namespacefbgemm__gpu.html#ab57019812325465b62248776bb200885',1,'fbgemm_gpu']]], + ['pruned_5farray_5flookup_5ffrom_5frow_5fidx_5fcuda_51',['pruned_array_lookup_from_row_idx_cuda',['../namespacefbgemm__gpu.html#adda552b8784184a2f17aa997e10869f9',1,'fbgemm_gpu']]], + ['pruned_5fhash_5ffunction_52',['pruned_hash_function',['../namespacenbit.html#adf6ceb44691d377239880812db632ef7',1,'nbit']]], + ['pruned_5fhashmap_5finsert_5funweighted_5fcpu_53',['pruned_hashmap_insert_unweighted_cpu',['../group__embedding-cpu.html#ga5b5d3d94a399c14899a4410d1f5e7dad',1,'pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga5b5d3d94a399c14899a4410d1f5e7dad',1,'pruned_hashmap_insert_unweighted_cpu(Tensor indices, Tensor dense_indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['pruned_5fhashmap_5finsert_5fweighted_5fcpu_54',['pruned_hashmap_insert_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#a446403a1c26f7fecbc1c67fd9be87bf0',1,'gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]], + 
['pruned_5fhashmap_5flookup_5fcuda_55',['pruned_hashmap_lookup_cuda',['../group__embedding-cuda.html#ga1adb0a98306b7d6f839b5fbcaaa44ec7',1,'pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): embedding_forward_quantized_split_lookup.cu'],['../group__embedding-cuda.html#ga1adb0a98306b7d6f839b5fbcaaa44ec7',1,'pruned_hashmap_lookup_cuda(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): embedding_forward_quantized_split_lookup.cu']]], + ['pruned_5fhashmap_5flookup_5funweighted_5fcpu_56',['pruned_hashmap_lookup_unweighted_cpu',['../group__embedding-cpu.html#ga2c64467f516cc9caf72cb94e9913b211',1,'pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../group__embedding-cpu.html#ga2c64467f516cc9caf72cb94e9913b211',1,'pruned_hashmap_lookup_unweighted_cpu(Tensor indices, Tensor offsets, Tensor hash_table, Tensor hash_table_offsets): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['pruned_5fhashmap_5flookup_5fweighted_5fcpu_57',['pruned_hashmap_lookup_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#ae0d1d716d565d7e70bd253dcd89d7f47',1,'gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]] +]; diff --git a/search/functions_11.js b/search/functions_11.js new file mode 100644 index 000000000..95f55fdce --- /dev/null +++ b/search/functions_11.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['quantize_5fstore_0',['quantize_store',['../namespacefbgemm__gpu.html#af5bbc85156e52ab097bb0f770a2f63e7',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_12.js b/search/functions_12.js new file mode 100644 index 000000000..d3b1b6222 --- /dev/null +++ b/search/functions_12.js @@ -0,0 +1,24 @@ +var searchData= +[ + 
['recat_5fcopy_5fasync_5fkernel_0',['recat_copy_async_kernel',['../layout__transform__ops_8cuh.html#a2f3c62685f843be282e18a9805d8ad5c',1,'layout_transform_ops.cuh']]], + ['recat_5fembedding_5fgrad_5foutput_5fcuda_1',['recat_embedding_grad_output_cuda',['../group__layout-transform-cuda.html#ga09438223bb710af7f55fb6d25fc9d99f',1,'fbgemm_gpu']]], + ['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fbatch_5fcuda_2',['recat_embedding_grad_output_mixed_D_batch_cuda',['../group__layout-transform-cuda.html#gad5cabc0ba0ee6dfd8a8de4e5825c62e9',1,'fbgemm_gpu']]], + ['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fcpu_3',['recat_embedding_grad_output_mixed_D_cpu',['../group__layout-transform-cpu.html#ga8edc2bee42577b7eeb76613b52d62311',1,'fbgemm_gpu']]], + ['recat_5fembedding_5fgrad_5foutput_5fmixed_5fd_5fcuda_4',['recat_embedding_grad_output_mixed_D_cuda',['../group__layout-transform-cuda.html#gaf753887183c2603a01978463228a0343',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5findices_5fcpu_5',['reorder_batched_ad_indices_cpu',['../namespacefbgemm__gpu.html#a71657f0dff28b74e6cb71f2e70adba96',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5findices_5fcpu_5f_6',['reorder_batched_ad_indices_cpu_',['../namespacefbgemm__gpu.html#abe2eef805cfc20b2d3ba69e3db973688',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5findices_5fgpu_7',['reorder_batched_ad_indices_gpu',['../namespacefbgemm__gpu.html#a10ae2e750abd260fb3dc2deb5e6a10a6',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5f_8',['reorder_batched_ad_lengths_',['../namespacefbgemm__gpu.html#a87472f171b785c3735bc88d72c8ddd9e',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5fcpu_9',['reorder_batched_ad_lengths_cpu',['../namespacefbgemm__gpu.html#aee6a046b2315137787cced8d9942a248',1,'fbgemm_gpu']]], + ['reorder_5fbatched_5fad_5flengths_5fgpu_10',['reorder_batched_ad_lengths_gpu',['../namespacefbgemm__gpu.html#af398efd1fa34f78e6882f7691aa99fa9',1,'fbgemm_gpu']]], + 
['report_5fembedding_5ferror_11',['report_embedding_error',['../namespacefbgemm__gpu.html#a17e57fc2dca2d6df09e26f3eec69464c',1,'fbgemm_gpu']]], + ['reset_12',['reset',['../structfbgemm__gpu_1_1_vec4_acc_t.html#a290527af29e033f3ed6f5464ded1b07e',1,'fbgemm_gpu::Vec4AccT']]], + ['reset_5fweight_5fmomentum_5fcuda_13',['reset_weight_momentum_cuda',['../group__table-batched-embed-cuda.html#ga59334fdad832f8d67576e6c83a9b9d79',1,'reset_weight_momentum_cuda(at::Tensor dev_weights, at::Tensor uvm_weights, at::Tensor lxu_cache_weights, at::Tensor weights_placements, at::Tensor weights_offsets, at::Tensor momentum1_dev, at::Tensor momentum1_uvm, at::Tensor momentum1_placements, at::Tensor momentum1_offsets, at::Tensor D_offsets, at::Tensor pruned_indices, at::Tensor pruned_indices_offsets, at::Tensor logical_table_ids, at::Tensor buffer_ids, at::Tensor cache_hash_size_cumsum, at::Tensor lxu_cache_state, int64_t total_cache_hash_size): reset_weight_momentum.cu'],['../group__table-batched-embed-cuda.html#ga59334fdad832f8d67576e6c83a9b9d79',1,'reset_weight_momentum_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor lxu_cache_weights, Tensor weights_placements, Tensor weights_offsets, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor D_offsets, Tensor pruned_indices, Tensor pruned_indices_offsets, Tensor logical_table_ids, Tensor buffer_ids, Tensor cache_hash_size_cumsum, Tensor lxu_cache_state, int64_t total_cache_hash_size): reset_weight_momentum.cu']]], + ['rk_5fdouble_14',['rk_double',['../namespacefbgemm__gpu.html#af9dc4afe0a87b2326caf53649eee20eb',1,'fbgemm_gpu']]], + ['rk_5frandom_15',['rk_random',['../namespacefbgemm__gpu.html#a3914fbd6fed76ebe8d05a1967ec5ccb9',1,'fbgemm_gpu']]], + ['rk_5fseed_16',['rk_seed',['../namespacefbgemm__gpu.html#ad56b0e8dd76a57dcc1e268831fe58abb',1,'fbgemm_gpu']]], + ['rk_5fzipf_17',['rk_zipf',['../namespacefbgemm__gpu.html#ac4468c32ea6dc23cc2d7bded57a53119',1,'fbgemm_gpu']]], + 
['round_5fdown_18',['round_down',['../namespacefbgemm__gpu.html#afad69123afbd407f6cd94913da47680e',1,'fbgemm_gpu']]], + ['round_5fup_19',['round_up',['../namespacenbit.html#a3f668dd605c2700542424899b9df54c6',1,'nbit']]], + ['run_5femulate_5fcache_5fmiss_20',['run_emulate_cache_miss',['../uvm__cache__miss__emulate__test_8cpp.html#ac9959da4e8495e9b74415473535a9c3e',1,'uvm_cache_miss_emulate_test.cpp']]] +]; diff --git a/search/functions_13.js b/search/functions_13.js new file mode 100644 index 000000000..85b1ae634 --- /dev/null +++ b/search/functions_13.js @@ -0,0 +1,139 @@ +var searchData= +[ + ['segment_5fsum_5fcsr_5fcpu_0',['segment_sum_csr_cpu',['../namespacefbgemm__gpu.html#a678327561759694192908f1f111424f7',1,'fbgemm_gpu']]], + ['segment_5fsum_5fcsr_5fcuda_1',['segment_sum_csr_cuda',['../namespacefbgemm__gpu.html#a8ae9711da44e5cd4a81f95a762b41180',1,'fbgemm_gpu']]], + ['set_2',['set',['../classssd_1_1_embedding_rocks_d_b.html#a1951c5647b663fc955ee1076f68190ec',1,'ssd::EmbeddingRocksDB']]], + ['set_5fcuda_3',['set_cuda',['../classssd_1_1_embedding_rocks_d_b.html#a1b6c5343b7eafae73491f0749f1151a9',1,'ssd::EmbeddingRocksDB']]], + ['set_5fstochastic_5frounding_4',['set_stochastic_rounding',['../structfbgemm__gpu_1_1_weight_row.html#a4548dbb10be8705cf81e3e2362f1cea3',1,'fbgemm_gpu::WeightRow']]], + ['shfl_5fdown_5fsync_5',['shfl_down_sync',['../namespacefbgemm__gpu.html#a52eb62356a603284f18652bc195274ea',1,'fbgemm_gpu']]], + ['shfl_5fsync_6',['shfl_sync',['../namespacefbgemm__gpu.html#a9b3fcf49a28b6524c8db8c7c523e1798',1,'fbgemm_gpu']]], + ['shfl_5fxor_7',['shfl_xor',['../namespacefbgemm__gpu.html#a17b07e8668ed9b29a8b37d21a829723d',1,'fbgemm_gpu']]], + ['should_5fprune_8',['should_prune',['../namespacefbgemm__gpu.html#a4ae09e478c1e9d6a414935fb6cf60f99',1,'fbgemm_gpu']]], + 
['size_9',['size',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a53408e729e4cd52d06e5c577afbfcf9d',1,'fbgemm_gpu::TensorAccessorBase::size()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a53408e729e4cd52d06e5c577afbfcf9d',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::size()']]], + ['sizes_10',['sizes',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a5b7afa180d3bd84115f26a365b167e5e',1,'fbgemm_gpu::TensorAccessorBase']]], + ['sort_11',['sort',['../structfbgemm__gpu_1_1_bitonic_sort.html#ae729c535b885ed8e2aca6d99ef51e4b0',1,'fbgemm_gpu::BitonicSort']]], + ['split_5fadagrad_5ftable_5fupdate_5fkernel_12',['split_adagrad_table_update_kernel',['../gen__embedding__optimizer__adagrad__split__device__kernel_8cuh.html#aae2b7a37c2c14a8e8575336d88932f5e',1,'gen_embedding_optimizer_adagrad_split_device_kernel.cuh']]], + ['split_5fadam_5ftable_5fupdate_5fkernel_13',['split_adam_table_update_kernel',['../gen__embedding__optimizer__adam__split__device__kernel_8cuh.html#a415ebd6751961f1e6826cfe2712cc85e',1,'gen_embedding_optimizer_adam_split_device_kernel.cuh']]], + ['split_5fapprox_5frowwise_5fadagrad_5ftable_5fupdate_5fkernel_14',['split_approx_rowwise_adagrad_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__split__device__kernel_8cuh.html#a9263ef077d631b455021b5cfe68d9632',1,'gen_embedding_optimizer_approx_rowwise_adagrad_split_device_kernel.cuh']]], + ['split_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5ftable_5fupdate_5fkernel_15',['split_approx_rowwise_adagrad_with_counter_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html#a2f7931888711cbd1dff1f7fda564b3a5',1,'gen_embedding_optimizer_approx_rowwise_adagrad_with_counter_split_device_kernel.cuh']]], + 
['split_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ftable_5fupdate_5fkernel_16',['split_approx_rowwise_adagrad_with_weight_decay_table_update_kernel',['../gen__embedding__optimizer__approx__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html#a30fdc78bf391825590b69585779a9baf',1,'gen_embedding_optimizer_approx_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh']]], + ['split_5fapprox_5fsgd_5ftable_5fupdate_5fkernel_17',['split_approx_sgd_table_update_kernel',['../gen__embedding__optimizer__approx__sgd__split__device__kernel_8cuh.html#abcf3f2a323ec4155270a5fcfffecd462',1,'gen_embedding_optimizer_approx_sgd_split_device_kernel.cuh']]], + ['split_5fdense_5ftable_5fupdate_5fkernel_18',['split_dense_table_update_kernel',['../gen__embedding__optimizer__dense__split__device__kernel_8cuh.html#a9a55851e1eec2af9f174c94e138a4aa7',1,'gen_embedding_optimizer_dense_split_device_kernel.cuh']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5fcpu_19',['split_embedding_backward_codegen_adagrad_cpu',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html#a5e9389fec0497e9f90df6043627319ca',1,'split_embedding_backward_codegen_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__adagrad__cpu_8cpp.html#a5e9389fec0497e9f90df6043627319ca',1,'split_embedding_backward_codegen_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t 
total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5funweighted_5fexact_5fcuda_20',['split_embedding_backward_codegen_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#a1207210a9545e9575750541d0b87d2ff',1,'split_embedding_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#a06b1cf5ad03a298c5257a31b33524398',1,'split_embedding_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor 
&offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadagrad_5fweighted_5fexact_5fcuda_21',['split_embedding_backward_codegen_adagrad_weighted_exact_cuda',['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#a0e8cc9d4217b55864ac828677d7d546d',1,'split_embedding_backward_codegen_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#affb9be553e49e7bea6a6c3f60b63dc04',1,'split_embedding_backward_codegen_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, 
const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadam_5funweighted_5fexact_5fcuda_22',['split_embedding_backward_codegen_adam_unweighted_exact_cuda',['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#a7c3fa518fa48a831ea3f8e691672808e',1,'split_embedding_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#ae27a3d26d13d596aaaa1e621990e0d71',1,'split_embedding_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fadam_5fweighted_5fexact_5fcuda_23',['split_embedding_backward_codegen_adam_weighted_exact_cuda',['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#aea34407b88c9df5b3be55e8ea24a347d',1,'split_embedding_backward_codegen_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t 
max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a8e4ae3bed221149c3b3ab6a5c0f38605',1,'split_embedding_backward_codegen_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_24',['split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#a346e3b137705a7c27ea4448090c853ca',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#afbce26182226d45104cf25fc6ebf90df',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor 
&lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fweighted_5fexact_5fcuda_25',['split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#a1ff3b73be256bfc5b6a6a92c35f5c101',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#ae5ec715aff7b59ae2cd64991053a8744',1,'split_embedding_backward_codegen_approx_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5fcpu_26',['split_embedding_backward_codegen_dense_cpu',['../gen__embedding__backward__dense__split__cpu_8cpp.html#a9872de3651e55555a2bea1c407c45c5d',1,'split_embedding_backward_codegen_dense_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, double unused=0): gen_embedding_backward_dense_split_cpu.cpp'],['../embedding__backward__dense__host__cpu_8cpp.html#a16114b295cd4bb55fd704d1cc575284f',1,'split_embedding_backward_codegen_dense_cpu(Tensor grad_output, Tensor host_weights, Tensor 
weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, double unused): gen_embedding_backward_dense_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5funweighted_5fexact_5fcuda_27',['split_embedding_backward_codegen_dense_unweighted_exact_cuda',['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#af39484621a2a43237ee275c7d9497e16',1,'split_embedding_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aebdb9ab2fd0166beebd42528ea223ac4',1,'split_embedding_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fdense_5fweighted_5fexact_5fcuda_28',['split_embedding_backward_codegen_dense_weighted_exact_cuda',['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#aeae20f9c1a93bb4297f2710fe00723a2',1,'split_embedding_backward_codegen_dense_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor 
&hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_weighted_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a4a920500b84d7febde7964cfa515c690',1,'split_embedding_backward_codegen_dense_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flamb_5funweighted_5fexact_5fcuda_29',['split_embedding_backward_codegen_lamb_unweighted_exact_cuda',['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#a45b16bde5dcd4ed361824c02fb19aa28',1,'split_embedding_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor 
momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#abafaac43ca0a5d04be6280c0db92ef81',1,'split_embedding_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flamb_5fweighted_5fexact_5fcuda_30',['split_embedding_backward_codegen_lamb_weighted_exact_cuda',['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#ac26e29ea75fba6b9f3922118cd293b96',1,'split_embedding_backward_codegen_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor 
&offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a2cb504a8487e7581fcf600c9dd9bb4da',1,'split_embedding_backward_codegen_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5flars_5fsgd_5funweighted_5fexact_5fcuda_31',['split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#a68717d5b465de7efb3f58ca7f1c9c48e',1,'split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#ad6a69a83e0c09e08c8854f3a988349c2',1,'split_embedding_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool 
use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5flars_5fsgd_5fweighted_5fexact_5fcuda_32',['split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#a3eff146e8f81f6d6dcc6e08f791b1c27',1,'split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a592a95a9e623ca87fb31c88bc11ef217',1,'split_embedding_backward_codegen_lars_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const 
int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fnone_5funweighted_5fexact_5fcuda_33',['split_embedding_backward_codegen_none_unweighted_exact_cuda',['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#ac780b945eb2c0cff713ff7280122da42',1,'split_embedding_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#ab8077c80baaf216fec8c7c0c81cd0c29',1,'split_embedding_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor 
&hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): gen_embedding_backward_none_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fnone_5fweighted_5fexact_5fcuda_34',['split_embedding_backward_codegen_none_weighted_exact_cuda',['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#a12b41a32a38b812420382dfb33a09e17',1,'split_embedding_backward_codegen_none_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a7808efa8b7d1caa4534528c97b55a26b',1,'split_embedding_backward_codegen_none_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const 
Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): gen_embedding_backward_none_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5funweighted_5fexact_5fcuda_35',['split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#a4d39b6b803c05c33caf58b4a2fbf37ac',1,'split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a3d1da3b63c8a16884d3de8d52c0b99fd',1,'split_embedding_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5fweighted_5fexact_5fcuda_36',['split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#ac295880f03c86cb263b324158e460e82',1,'split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, 
const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#aaa1e9d0adf68022fa575a63182a95745',1,'split_embedding_backward_codegen_partial_rowwise_adam_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5funweighted_5fexact_5fcuda_37',['split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#a561ce0f1da43ca47001db85a395203e1',1,'split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ab047f1b46e810b2a48f66387d37cd588',1,'split_embedding_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t 
BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5fweighted_5fexact_5fcuda_38',['split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#a70ac0537228900edc94bbd437c550a15',1,'split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a6619694897abaeee44b975fa9614d7e3',1,'split_embedding_backward_codegen_partial_rowwise_lamb_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fcpu_39',['split_embedding_backward_codegen_rowwise_adagrad_cpu',['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html#a73c1fd212c2c324e57b0f906a2598360',1,'split_embedding_backward_codegen_rowwise_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, 
double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__cpu_8cpp.html#a73c1fd212c2c324e57b0f906a2598360',1,'split_embedding_backward_codegen_rowwise_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fcuda_40',['split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#aca4e3268cb308c63a299f50cde66dec1',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, 
Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ad73707297535524e1eeff86f23adfdfa',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fvbe_5fcuda_41',['split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#a85ffab9880f2b1221f86a7f63c088096',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, 
const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ae52a1e89225c55716b2505ef0b14b32c',1,'split_embedding_backward_codegen_rowwise_adagrad_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fweighted_5fexact_5fcuda_42',['split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#a9e02b82c5db58357a98bc86454c2d7a5',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a394f3f0a5cbe256e703c0bb34bfe50b3',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const 
int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fweighted_5fexact_5fvbe_5fcuda_43',['split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda',['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#a0266589d7dcf9f22a9398090ae16abac',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#af257dbbdb6a2c64fdb2e038bb39190c1',1,'split_embedding_backward_codegen_rowwise_adagrad_weighted_exact_vbe_cuda(const 
Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5fcpu_44',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html#a2e90723fcad83f3054bc6f661de849c1',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_host, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_host, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, 
int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter__cpu_8cpp.html#a2e90723fcad83f3054bc6f661de849c1',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_host, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_host, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5funweighted_5fexact_5fcuda_45',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#afa64170f02313b2766c2cc3e25d2f5a9',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#aea7503341318b3b0142a83d310046516',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5fweighted_5fexact_5fcuda_46',['split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#aeed29f5cd2c5bacfd4ed37b2381c128b',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a07c978ecc3495651d0123d01876f68ca',1,'split_embedding_backward_codegen_rowwise_adagrad_with_counter_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu']]], + 
['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_47',['split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#acc9cd7c72b1624ec0df8d9f4edbde2cb',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a9f5e043a0a43d92b7a748c27e6ce8060',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const 
int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5fweighted_5fexact_5fcuda_48',['split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#a969bc368ad46c57ab47feac737df5001',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a6cb23330ccfc55cc78d828d1fd8b59fb',1,'split_embedding_backward_codegen_rowwise_adagrad_with_weight_decay_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5fcpu_49',['split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu',['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html#acb5592b9d0b5b9344302f69c0f1be10b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, 
int64_t iter=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad__cpu_8cpp.html#acb5592b9d0b5b9344302f69c0f1be10b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, Tensor momentum1_host, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5funweighted_5fexact_5fcuda_50',['split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#a10025996061290114d6060505057ce7b',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor 
momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a3a9f041d93d95908fbe76052c3d48a3e',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5fweighted_5fexact_5fcuda_51',['split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#a74ae14449034e73352a950be7faee8cd',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const 
Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#aad0ff2a4c042997b9969d779d3c91c59',1,'split_embedding_backward_codegen_rowwise_weighted_adagrad_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fcpu_52',['split_embedding_backward_codegen_sgd_cpu',['../gen__embedding__backward__sgd__split__cpu_8cpp.html#a9d914bb02aed97803fcc9237f00403fa',1,'split_embedding_backward_codegen_sgd_cpu(Tensor grad_output, Tensor host_weights, Tensor 
weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_sgd_split_cpu.cpp'],['../gen__embedding__backward__split__sgd__cpu_8cpp.html#a9d914bb02aed97803fcc9237f00403fa',1,'split_embedding_backward_codegen_sgd_cpu(Tensor grad_output, Tensor host_weights, Tensor weights_placements, Tensor weights_offsets, Tensor D_offsets, int64_t max_D, Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, bool stochastic_rounding, double learning_rate=0, int64_t output_dtype=static_cast< int64_t >(SparseType::FP32)): gen_embedding_backward_sgd_split_cpu.cpp']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fcuda_53',['split_embedding_backward_codegen_sgd_unweighted_exact_cuda',['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#ad2d75e84d796d6d8fae77c19e7a8af3b',1,'split_embedding_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): 
gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a8f7618b0f318fed552700a9303e0c500',1,'split_embedding_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fvbe_5fcuda_54',['split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda',['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#a216acb29a8d546146f5593b5abd7eaa1',1,'split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double 
learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#aa75d2899ee39c0d5f71e426d1cc7d57c',1,'split_embedding_backward_codegen_sgd_unweighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fweighted_5fexact_5fcuda_55',['split_embedding_backward_codegen_sgd_weighted_exact_cuda',['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#a16ec895b54d5b04f3fdfa67930c1c02a',1,'split_embedding_backward_codegen_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool 
use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a2934aefc05b7ad4bc6e07074f0a2ee1e',1,'split_embedding_backward_codegen_sgd_weighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_weighted_cuda.cu']]], + ['split_5fembedding_5fbackward_5fcodegen_5fsgd_5fweighted_5fexact_5fvbe_5fcuda_56',['split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda',['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#a9ee8617b61b6a4be1391fe53321bf927',1,'split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor 
&vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a67f194387a7e81de22d969964f1cc379',1,'split_embedding_backward_codegen_sgd_weighted_exact_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const Tensor &B_offsets, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fcpu_57',['split_embedding_codegen_forward_cpu',['../embedding__forward__split__cpu_8cpp.html#aaf201bc6f5c8deb12999a3eff03cf7bb',1,'split_embedding_codegen_forward_cpu(Tensor weights, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor hash_size_cumsum, Tensor indices, Tensor offsets, int64_t pooling_mode, Tensor indice_weights, int64_t output_dtype): embedding_forward_split_cpu.cpp'],['../embedding__forward__split__cpu_8h.html#a01e2ccf0c687aa129f511c048dd878a2',1,'split_embedding_codegen_forward_cpu(at::Tensor weights, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor hash_size_cumsum, at::Tensor indices, at::Tensor offsets, int64_t pooling_mode, at::Tensor indice_weights, int64_t output_dtype=0): embedding_forward_split_cpu.cpp']]], 
+ ['split_5fembedding_5fcodegen_5fforward_5fcpu_5fmeta_58',['split_embedding_codegen_forward_cpu_meta',['../embedding__forward__split__cpu_8cpp.html#a0641f4b915d503586cb2d251029169e4',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_59',['split_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor 
&lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#abc6855d69e1817ffa8c28948673c7b0b',1,'split_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fmeta_60',['split_embedding_codegen_forward_unweighted_meta',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#a2b7fe88621ffc9b8dc0b55efafb6cb83',1,'gen_embedding_forward_split_unweighted_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5funweighted_5fvbe_5fcuda_61',['split_embedding_codegen_forward_unweighted_vbe_cuda',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const 
int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#a028ac1d276dc02b3db5e9195eea165f3',1,'split_embedding_codegen_forward_unweighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu']]], + 
['split_5fembedding_5fcodegen_5fforward_5funweighted_5fvbe_5fmeta_62',['split_embedding_codegen_forward_unweighted_vbe_meta',['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html#ac45ac774af2f2cdc3ef15fccacbc9866',1,'gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_63',['split_embedding_codegen_forward_weighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, 
const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor 
&lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#a25265c0efdc4f020ea5b8d5e730dfb31',1,'split_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_weighted_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fmeta_64',['split_embedding_codegen_forward_weighted_meta',['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html#a3f1b063bf337baa7c85cd891f50dcb17',1,'gen_embedding_forward_split_weighted_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fvbe_5fcuda_65',['split_embedding_codegen_forward_weighted_vbe_cuda',['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor 
&weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#a0b7f13ed95640b7a8e42d3f0ff3f2b46',1,'split_embedding_codegen_forward_weighted_vbe_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const Tensor &lxu_cache_locations, const int64_t output_dtype, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t vbe_output_size, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool 
is_experimental): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fforward_5fweighted_5fvbe_5fmeta_66',['split_embedding_codegen_forward_weighted_vbe_meta',['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html#aafe550801c2d2c26cf43ccef3a6ac0e9',1,'gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcpu_67',['split_embedding_codegen_grad_indice_weights_cpu',['../embedding__forward__split__cpu_8cpp.html#a03b54fa4944d00f3984442a980742701',1,'split_embedding_codegen_grad_indice_weights_cpu(Tensor grad_output, Tensor weights, Tensor weights_offsets, Tensor D_offsets, Tensor indices, Tensor offsets, Tensor feature_requires_grad): embedding_forward_split_cpu.cpp'],['../embedding__forward__split__cpu_8h.html#a371a7887c9af52b22bdc10e84d5c2ba6',1,'split_embedding_codegen_grad_indice_weights_cpu(at::Tensor grad_output, at::Tensor weights, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor indices, at::Tensor offsets, at::Tensor feature_requires_grad): embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcuda_68',['split_embedding_codegen_grad_indice_weights_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor 
&dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor 
&feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor 
&lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a1fb867d681110d956ddaf10f110156a1',1,'split_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad): gen_embedding_backward_split_indice_weights_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fvbe_5fcuda_69',['split_embedding_codegen_grad_indice_weights_vbe_cuda',['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): 
gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#ae8a987f07ba5142ffd7a0733824925a2',1,'split_embedding_codegen_grad_indice_weights_vbe_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const Tensor &feature_requires_grad, const Tensor &vbe_row_output_offsets, const Tensor &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask_int64): gen_embedding_backward_split_indice_weights_codegen_cuda.cu']]], + ['split_5fembedding_5fcodegen_5flookup_5fadagrad_5ffunction_70',['split_embedding_codegen_lookup_adagrad_function',['../group__embedding-cuda.html#gaa0988eef90f8662e8886912ed3784c1d',1,'gen_embedding_backward_split_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fadam_5ffunction_71',['split_embedding_codegen_lookup_adam_function',['../group__embedding-cuda.html#ga639ddbb31e9d565bfcfa4766b14c9ef6',1,'gen_embedding_backward_split_adam.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5ffunction_72',['split_embedding_codegen_lookup_approx_rowwise_adagrad_function',['../group__embedding-cuda.html#gac847393d811e7b22ace39ff91eb91e27',1,'gen_embedding_backward_split_approx_rowwise_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5fwith_5fcounter_5ffunction_73',['split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function',['../group__embedding-cuda.html#gabf7587752fb66934350cec59cd7adda9',1,'gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ffunction_74',['split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function',['../group__embedding-cuda.html#ga0a7191adb6807417bfaab85ccb6fac50',1,'gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fapprox_5fsgd_5ffunction_75',['split_embedding_codegen_lookup_approx_sgd_function',['../group__embedding-cuda.html#gabcff81381942478b57805e5deb7725fb',1,'gen_embedding_backward_split_approx_sgd.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fdense_5ffunction_76',['split_embedding_codegen_lookup_dense_function',['../embedding__backward__dense__host_8cpp.html#a04b7d97e6fd0bbb6e9877db0c1b7e506',1,'embedding_backward_dense_host.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5flamb_5ffunction_77',['split_embedding_codegen_lookup_lamb_function',['../group__embedding-cuda.html#ga1c377dd2500d38974bbfe0e69243e084',1,'gen_embedding_backward_split_lamb.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5flars_5fsgd_5ffunction_78',['split_embedding_codegen_lookup_lars_sgd_function',['../group__embedding-cuda.html#ga5c0d733a2e781ea4c9fc5ab3a2d6ccf3',1,'gen_embedding_backward_split_lars_sgd.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fnone_5ffunction_79',['split_embedding_codegen_lookup_none_function',['../group__embedding-cuda.html#ga855a30b389de5a61097f44cff795b6c3',1,'gen_embedding_backward_split_none.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fpartial_5frowwise_5fadam_5ffunction_80',['split_embedding_codegen_lookup_partial_rowwise_adam_function',['../group__embedding-cuda.html#ga06feb6c425fba7c460dc0da550d4e4e6',1,'gen_embedding_backward_split_partial_rowwise_adam.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5fpartial_5frowwise_5flamb_5ffunction_81',['split_embedding_codegen_lookup_partial_rowwise_lamb_function',['../group__embedding-cuda.html#ga37b9129c928c9cb39459198f36f11c8d',1,'gen_embedding_backward_split_partial_rowwise_lamb.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5ffunction_82',['split_embedding_codegen_lookup_rowwise_adagrad_function',['../group__embedding-cuda.html#gacc3d997b675b747985dd37193cac4edd',1,'gen_embedding_backward_split_rowwise_adagrad.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5fwith_5fcounter_5ffunction_83',['split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function',['../group__embedding-cuda.html#ga917cf0c2c4487425408808529ed05e68',1,'gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ffunction_84',['split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function',['../group__embedding-cuda.html#ga2e19021f546871ef6f1e57fca7cf5e13',1,'gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp']]], + ['split_5fembedding_5fcodegen_5flookup_5frowwise_5fweighted_5fadagrad_5ffunction_85',['split_embedding_codegen_lookup_rowwise_weighted_adagrad_function',['../group__embedding-cuda.html#ga54a40e0e64a528731d45bca998727a1c',1,'gen_embedding_backward_split_rowwise_weighted_adagrad.cpp']]], + 
['split_5fembedding_5fcodegen_5flookup_5fsgd_5ffunction_86',['split_embedding_codegen_lookup_sgd_function',['../group__embedding-cuda.html#ga66c2eb0df8e5dab40f0d862ebe43bd34',1,'gen_embedding_backward_split_sgd.cpp']]], + ['split_5fembedding_5fforward_5fcpu_5fkernel_87',['split_embedding_forward_cpu_kernel',['../embedding__forward__split__cpu_8cpp.html#af360a949beb9bba72466614e220da13d',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fgrad_5findice_5fweights_5fcpu_5fkernel_88',['split_embedding_grad_indice_weights_cpu_kernel',['../embedding__forward__split__cpu_8cpp.html#a1156d3aee8ccb8a6676b22f78fe0829c',1,'embedding_forward_split_cpu.cpp']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fadagrad_5funweighted_5fexact_5fcuda_89',['split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#a635c3123249dcf767e8d80923e11a7b1',1,'split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#ad491e078738cfd46a4d2377948b977fc',1,'split_embedding_nobag_backward_codegen_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const 
Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fadam_5funweighted_5fexact_5fcuda_90',['split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda',['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#a6392bb8bf8131572a96cb5bf5a363152',1,'split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a6a9de0e9036f30dbd7d7e4442ae7e5fe',1,'split_embedding_nobag_backward_codegen_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fapprox_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_91',['split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#a8d755844b3dc430390b0db02833650a7',1,'split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const 
Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a701f363d76409a2aa4df028f12ba0300',1,'split_embedding_nobag_backward_codegen_approx_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fdense_5funweighted_5fexact_5fcuda_92',['split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda',['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#a11ce1782edb9d58fffb5fe2581172d70',1,'split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t unused_, const int64_t max_segment_length_per_warp, double unused): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a7911ad2a461036b977d8d9f9fafb391a',1,'split_embedding_nobag_backward_codegen_dense_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const double unused): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5flamb_5funweighted_5fexact_5fcuda_93',['split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda',['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#ad6463435db98705077041803b394dcc3',1,'split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool 
stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a557b019964c8d292ca9923927e0d974a',1,'split_embedding_nobag_backward_codegen_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5flars_5fsgd_5funweighted_5fexact_5fcuda_94',['split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda',['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#a0e0807f32e264e5a83586907ea3f6749',1,'split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double learning_rate, double eta, double momentum, double weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a80df1bf7d746582f689d6bc4652f7266',1,'split_embedding_nobag_backward_codegen_lars_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor 
momentum1_placements, Tensor momentum1_offsets, double learning_rate=0, double eta=0, double momentum=0, double weight_decay=0): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fnone_5funweighted_5fexact_5fcuda_95',['split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda',['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#af181f8da92e59fb5da465d0931859e77',1,'split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a1540203f5279dd87016b397fe33fb041',1,'split_embedding_nobag_backward_codegen_none_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, int64_t total_hash_size=0, int64_t total_unique_indices=0): 
gen_embedding_backward_none_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fpartial_5frowwise_5fadam_5funweighted_5fexact_5fcuda_96',['split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#a96e4c395674727814da03c2e1654487b',1,'split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#aed21b16681b11ddd3303195bc4e278ec',1,'split_embedding_nobag_backward_codegen_partial_rowwise_adam_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, 
const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fpartial_5frowwise_5flamb_5funweighted_5fexact_5fcuda_97',['split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda',['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#a41c428effc52b315649ebd4bda728619',1,'split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate, double eps, double beta1, double beta2, double weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ad14c41705ba6da0dc89b8802945b9a3a',1,'split_embedding_nobag_backward_codegen_partial_rowwise_lamb_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor momentum2_dev, Tensor momentum2_uvm, Tensor momentum2_placements, Tensor momentum2_offsets, double learning_rate=0, double eps=0, double beta1=0, double beta2=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5funweighted_5fexact_5fcuda_98',['split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#a05fd1c9f2aea152f9cbe2def957c66fb',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, 
const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode, double max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a4bdf992307f845985594c371275668a8',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fcounter_5funweighted_5fexact_5fcuda_99',['split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#a0377d50ef90391567b4819a19bffb34c',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const 
Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps, double learning_rate, double weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, double adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, double max_counter, double tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, double weight_norm_coefficient, double lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a7a94588a2cce7c8cad5f1654d5724ea3',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_counter_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool 
stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, Tensor prev_iter_dev, Tensor prev_iter_uvm, Tensor prev_iter_placements, Tensor prev_iter_offsets, Tensor row_counter_dev, Tensor row_counter_uvm, Tensor row_counter_placements, Tensor row_counter_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t iter=0, int64_t counter_halflife=-1, int64_t adjustment_iter=-1, double adjustment_ub=1.0, int64_t learning_rate_mode=-1, int64_t weight_decay_mode=1, int64_t grad_sum_decay=-1, double max_counter=0, double tail_id_threshold=0.0, int64_t is_tail_id_thresh_ratio=0, int64_t regularization_mode=0, double weight_norm_coefficient=0.0, double lower_bound=0.0): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5funweighted_5fexact_5fcuda_100',['split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af25017968213662e5c8c0ab9f5fa7e9a',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, 
const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a31dd9b41f6ea038416e54092a7fcb594',1,'split_embedding_nobag_backward_codegen_rowwise_adagrad_with_weight_decay_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu']]], + ['split_5fembedding_5fnobag_5fbackward_5fcodegen_5frowwise_5fweighted_5fadagrad_5funweighted_5fexact_5fcuda_101',['split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda',['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#a42435ea3b63f42213a2c24d4aadc84f6',1,'split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor 
&weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps, double learning_rate, double weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#aaf57ee3cb4514d7ccec1c0f5bd653ed3',1,'split_embedding_nobag_backward_codegen_rowwise_weighted_adagrad_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0, int64_t iter=0): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fbackward_5fcodegen_5fsgd_5funweighted_5fexact_5fcuda_102',['split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda',['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#a2260d3e46945437faae7a44fe015bf7c',1,'split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t unused_, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#ad92e69305915e46befca51e7288b428b',1,'split_embedding_nobag_backward_codegen_sgd_unweighted_exact_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t BT_block_size, const int64_t max_segment_length_per_warp, const bool stochastic_rounding, const int64_t info_B_num_bits, const int64_t info_B_mask_int64, const bool use_uniq_cache_locations, const bool use_homogeneous_placements, double learning_rate=0): gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu']]], + 
['split_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_103',['split_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__backward__split__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__adam_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, 
const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__none_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor 
&lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): 
gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__backward__split__sgd_8cpp.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#a0d1a75d11b80a3b6735993fc34eb067e',1,'split_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &uvm_weights, const Tensor &lxu_cache_weights, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const Tensor &lxu_cache_locations, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_split_unweighted_codegen_cuda.cu']]], + ['split_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fmeta_104',['split_embedding_nobag_codegen_forward_unweighted_meta',['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#a580b1b950402848a3c71d7092a69ceb7',1,'gen_embedding_forward_split_unweighted_codegen_meta.cpp']]], + 
['split_5fembedding_5frowwise_5fadagrad_5fupdate_105',['split_embedding_rowwise_adagrad_update',['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html#a21a7b48ff9760f1aa13e260de4b7d2a9',1,'split_embedding_rowwise_adagrad_update(Tensor &dev_weights, Tensor &uvm_weights, Tensor &lxu_cache_weights, const Tensor &grad_dev_weights, const Tensor &grad_dev_indices, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t max_D, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_optimizer_rowwise_adagrad_split.cpp'],['../gen__embedding__optimizer__rowwise__adagrad__split__cuda_8cu.html#ab369ffc9f9e69eca82b24131247ecfcf',1,'split_embedding_rowwise_adagrad_update(Tensor &dev_weights, Tensor &uvm_weights, Tensor &lxu_cache_weights, const Tensor &grad_dev_weights, const Tensor &grad_dev_indices, const Tensor &weights_placements, const Tensor &weights_offsets, const int64_t max_D, const bool stochastic_rounding, Tensor momentum1_dev, Tensor momentum1_uvm, Tensor momentum1_placements, Tensor momentum1_offsets, double eps=0, double learning_rate=0, double weight_decay=0.0, int64_t weight_decay_mode=0, double max_norm=0.0): gen_embedding_optimizer_rowwise_adagrad_split_cuda.cu']]], + ['split_5flamb_5ftable_5fupdate_5fkernel_106',['split_lamb_table_update_kernel',['../gen__embedding__optimizer__lamb__split__device__kernel_8cuh.html#a2952f72a1e3f88f38246d2954dbee2b1',1,'gen_embedding_optimizer_lamb_split_device_kernel.cuh']]], + ['split_5flars_5fsgd_5ftable_5fupdate_5fkernel_107',['split_lars_sgd_table_update_kernel',['../gen__embedding__optimizer__lars__sgd__split__device__kernel_8cuh.html#af488b727a53946064f329ad042bbf73a',1,'gen_embedding_optimizer_lars_sgd_split_device_kernel.cuh']]], + 
['split_5fnone_5ftable_5fupdate_5fkernel_108',['split_none_table_update_kernel',['../gen__embedding__optimizer__none__split__device__kernel_8cuh.html#a2cb53295ff111df7a98fbc7573469c61',1,'gen_embedding_optimizer_none_split_device_kernel.cuh']]], + ['split_5fpartial_5frowwise_5fadam_5ftable_5fupdate_5fkernel_109',['split_partial_rowwise_adam_table_update_kernel',['../gen__embedding__optimizer__partial__rowwise__adam__split__device__kernel_8cuh.html#a278aedfb9f50b7f5486dbc97e87cab8e',1,'gen_embedding_optimizer_partial_rowwise_adam_split_device_kernel.cuh']]], + ['split_5fpartial_5frowwise_5flamb_5ftable_5fupdate_5fkernel_110',['split_partial_rowwise_lamb_table_update_kernel',['../gen__embedding__optimizer__partial__rowwise__lamb__split__device__kernel_8cuh.html#a950ea306504584d6cc2050caf007295c',1,'gen_embedding_optimizer_partial_rowwise_lamb_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5ftable_5fupdate_5fkernel_111',['split_rowwise_adagrad_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__split__device__kernel_8cuh.html#aab5a925ed0316c38c00fcce3b1adc50a',1,'gen_embedding_optimizer_rowwise_adagrad_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5fwith_5fcounter_5ftable_5fupdate_5fkernel_112',['split_rowwise_adagrad_with_counter_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__with__counter__split__device__kernel_8cuh.html#aa7708111891a0d2eeeda7881715427bb',1,'gen_embedding_optimizer_rowwise_adagrad_with_counter_split_device_kernel.cuh']]], + ['split_5frowwise_5fadagrad_5fwith_5fweight_5fdecay_5ftable_5fupdate_5fkernel_113',['split_rowwise_adagrad_with_weight_decay_table_update_kernel',['../gen__embedding__optimizer__rowwise__adagrad__with__weight__decay__split__device__kernel_8cuh.html#ae265a93446a3c4665e857bc8b2f7d8d7',1,'gen_embedding_optimizer_rowwise_adagrad_with_weight_decay_split_device_kernel.cuh']]], + 
['split_5frowwise_5fweighted_5fadagrad_5ftable_5fupdate_5fkernel_114',['split_rowwise_weighted_adagrad_table_update_kernel',['../gen__embedding__optimizer__rowwise__weighted__adagrad__split__device__kernel_8cuh.html#a54b1af3a7b8db5fce48d934e47656c50',1,'gen_embedding_optimizer_rowwise_weighted_adagrad_split_device_kernel.cuh']]], + ['split_5fsgd_5ftable_5fupdate_5fkernel_115',['split_sgd_table_update_kernel',['../gen__embedding__optimizer__sgd__split__device__kernel_8cuh.html#ab768e225fdd76b64ab5c9114ed3cc7cc',1,'gen_embedding_optimizer_sgd_split_device_kernel.cuh']]], + ['splitmix64_5fstateless_116',['splitmix64_stateless',['../namespacefbgemm__gpu.html#aa5ada0472a8306dea17df0d7d1d42abc',1,'fbgemm_gpu']]], + ['ssd_5fcache_5fpopulate_5factions_5fcuda_117',['ssd_cache_populate_actions_cuda',['../ssd__split__embeddings__cache__cuda_8cu.html#a872136033719ff00d6b05e94e4b1cbab',1,'ssd_cache_populate_actions_cuda(Tensor linear_indices, int64_t total_hash_size, Tensor lxu_cache_state, int64_t time_stamp, int64_t prefetch_dist, Tensor lru_state): ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__table__batched__embeddings_8cpp.html#a872136033719ff00d6b05e94e4b1cbab',1,'ssd_cache_populate_actions_cuda(Tensor linear_indices, int64_t total_hash_size, Tensor lxu_cache_state, int64_t time_stamp, int64_t prefetch_dist, Tensor lru_state): ssd_split_embeddings_cache_cuda.cu']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_5fcpu_118',['stacked_jagged_1d_to_dense_cpu',['../namespacefbgemm__gpu.html#a6ac9f6d81bff1b8572a380dbe1af00fb',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f1d_5fto_5fdense_5fgpu_119',['stacked_jagged_1d_to_dense_gpu',['../namespacefbgemm__gpu.html#adf7f39b1a3dd7c2797fd11e740d6269f',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fbackward_5fcuda_120',['stacked_jagged_2d_to_dense_backward_cuda',['../namespacefbgemm__gpu.html#a442efbf57b46780a07ac4759ac1866ee',1,'fbgemm_gpu']]], + 
['stacked_5fjagged_5f2d_5fto_5fdense_5fcpu_121',['stacked_jagged_2d_to_dense_cpu',['../namespacefbgemm__gpu.html#ab45e5e415a8929cbd0021eae37e1d881',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fforward_5fcuda_122',['stacked_jagged_2d_to_dense_forward_cuda',['../namespacefbgemm__gpu.html#a5de1d5c177df840f2fa7ab0cdda2aa02',1,'fbgemm_gpu']]], + ['stacked_5fjagged_5f2d_5fto_5fdense_5fgpu_123',['stacked_jagged_2d_to_dense_gpu',['../namespacefbgemm__gpu.html#aaac575e676d094aba1367e9eaf3489bc',1,'fbgemm_gpu']]], + ['stochastic_5frounding_5finit_124',['stochastic_rounding_init',['../namespacefbgemm__gpu.html#afe523b46c92c9009410f173e4ac434db',1,'fbgemm_gpu']]], + ['stochastic_5frounding_5frand4_125',['stochastic_rounding_rand4',['../namespacefbgemm__gpu.html#af0b19e6751891f43372768335cc3c468',1,'fbgemm_gpu']]], + ['stochastic_5frounding_5fvector_126',['stochastic_rounding_vector',['../namespacefbgemm__gpu.html#aec7be9515265c4db67d205f8a3a39822',1,'fbgemm_gpu::stochastic_rounding_vector(dst_t *output, const Vec4T< src_t > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a06c37bb32cb18b8846cf689db8ed94fb',1,'fbgemm_gpu::stochastic_rounding_vector(at::Half *output, const Vec4T< at::Half > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a7d41dbbfc3106c8fd5ff37cefbffbc38',1,'fbgemm_gpu::stochastic_rounding_vector(at::Half *output, const Vec4T< float > &value, StochasticRoundingRNGState &state, const float2)'],['../namespacefbgemm__gpu.html#a3313b5c0af7bd07d6e47253a24a27ce7',1,'fbgemm_gpu::stochastic_rounding_vector(uint8_t *output, const Vec4T< float > &value, StochasticRoundingRNGState &state, const float2 qparams)'],['../namespacefbgemm__gpu.html#a44ed26caaddd852d96ee453ea6cc2e07',1,'fbgemm_gpu::stochastic_rounding_vector(uint8_t *output, const Vec4T< at::Half > &value, StochasticRoundingRNGState &state, const float2 qparams)']]], + 
['store_127',['store',['../structfbgemm__gpu_1_1_half4.html#a89967f417dba84846fa95a0f010d8922',1,'fbgemm_gpu::Half4::store()'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< float >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ab31e8852ca6760cf83d6356c8c448596',1,'fbgemm_gpu::Vec4T< float >::store(float4 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< float >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< float >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< float >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< float >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< at::Half >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< at::Half >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< at::Half >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< at::Half >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< at::Half >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(at::Half *p) 
const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a85854690aa7af9f8006cf54d577d8e77',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::store(uint8_t *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a42f2f5c890748268ece0df580bbafa44',1,'fbgemm_gpu::Vec4T< double >::store(double *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a156eebe566e80706636626c60d2d13b0',1,'fbgemm_gpu::Vec4T< double >::store(float *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ac1717b1a00b76b3d368982629c5e8287',1,'fbgemm_gpu::Vec4T< double >::store(at::Half *p) const'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a8513259b78c1bcc3e849beea82b95edd',1,'fbgemm_gpu::Vec4T< double >::store(at::BFloat16 *p) const'],['../structfbgemm__gpu_1_1_weight_row.html#a2118cba7a45acc1a3d8ea5781badbbe9',1,'fbgemm_gpu::WeightRow::store()'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a8191536a88223b7249cae8a8cfa97979',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ab9651b6b0e85a41131aa086c367d68bd',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a0624585ab8592b64edef7a6730938cb9',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, 
const int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a5ba7b1dad5adec8ae5dc9e4adfe58c38',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(uint8_t *output_ptr, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ab208ce429674113143ee02d6b9e8a9be',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ac87524a86f8aa165742c6b793f8fe6aa',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a26ce31b610926ff405b67dc540ff3d95',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a392a5b352be9af9ba86e0cd396e6316a',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=1)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a4699624d6b086fa52d88ce1960dc7297',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a1f0743afcc39c1afeeee6cd9bcdddc35',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a2c56bff3020a6b803a8310a13b61cfbe',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#ac4e67ed3ba860166333a7805b101490d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP 
>::store(uint8_t *output_ptr, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a0c562343c84b60da0e5f11ee16e593f2',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a68c214376e86167cbe59755a1caf99a5',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a9e2e827bb7e7c608f3acd3953a39e720',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a32f2acc26afe1a9cf7d5152567bbd15d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=2)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a75cd31fa56a77c83611b64ddd370a562',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(float *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a81504bf4294b938a3efc8d00acda3b5f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::Half *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a03b4a86f4326d9c24fec2b4dc63439cd',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ad15c2605b8d982986100c89caa7c0401',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(uint8_t *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a786f9130a8df81af5fc3b0706a1a6545',1,'fbgemm_gpu::VecNT< 4, 
PrimitiveType::FP >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#afbc2050eefc2350fd0f84db8dd568d14',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(float *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a12b87408afdd840ed3ae2e1870fa8e2a',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a6740fe48ec591c6058b8c5019ca0b599',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a75cd31fa56a77c83611b64ddd370a562',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a81504bf4294b938a3efc8d00acda3b5f',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a03b4a86f4326d9c24fec2b4dc63439cd',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#ad15c2605b8d982986100c89caa7c0401',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a786f9130a8df81af5fc3b0706a1a6545',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int 
num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#afbc2050eefc2350fd0f84db8dd568d14',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a12b87408afdd840ed3ae2e1870fa8e2a',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a6740fe48ec591c6058b8c5019ca0b599',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#acf9a6b5f9ac186a75bd50800993e7241',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa231a6e5c1ad91305125e2ba8c6cf773',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa2d60424caff50f6d80adfcd1ab5ba3f',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a5881b8e1b9ca2c81640bad8e6d0a455a',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aa7c2038d0448a12c5edd87eb31f8b828',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#abfaf6f8618474ccb25d58d723792421d',1,'fbgemm_gpu::VecNT< 8, 
PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#aad5d604b72b0f656dbeb5e313ebf63af',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a6dfa84a3eb11e20e68d8d3b401c7d2cf',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=8)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a82b07f279fccc086af2208ca7d6d1a3a',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(float *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a1f25b384b68cdb93ddd010a86f661460',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::Half *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#aa9b6e7a0e81a3a3d049e7c632fec2ad7',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, const int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a0d5c2181816bdbb6e5e4998b3fbba721',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(uint8_t *output_ptr, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a23eb49aef842e89c0f4403d45df27af9',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a483f290add1c81ba850fda8c574f68bb',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(float *output_ptr, float2 qparams, int 
num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a9b3adeaa52d595467e06b90520c9708a',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#aba368627faa071e57a548a336c7bee6b',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=16)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ac85ba1113a076bb8a6b6e39ad26bb85d',1,'fbgemm_gpu::Vec4AccT::store(float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a775650f6f2480831282ed0a8746998f6',1,'fbgemm_gpu::Vec4AccT::store(float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ae4768b5f85cb93226f4e8e7705a32206',1,'fbgemm_gpu::Vec4AccT::store(uint8_t *ptr)']]], + ['store_5f_128',['store_',['../structfbgemm__gpu_1_1_vec4_acc_t.html#aa05890f2dd90061ad3ff516a30e6c196',1,'fbgemm_gpu::Vec4AccT::store_(const float4 *src, float4 *dst)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a476bc3df6ed11614b47e7c4b1bb440c6',1,'fbgemm_gpu::Vec4AccT::store_(const float4 *src, float2 *dst)']]], + ['store_5fqparams_129',['store_qparams',['../structfbgemm__gpu_1_1_weight_row.html#a7e20dc1480b5220df335895b7ac6bdd0',1,'fbgemm_gpu::WeightRow']]], + ['store_5fqparams_5fto_5frow_130',['store_qparams_to_row',['../namespacefbgemm__gpu.html#a8afc4c2510a6db3d420fc1025d3ac30b',1,'fbgemm_gpu::store_qparams_to_row(emb_t *ptr, float2 qparams)'],['../namespacefbgemm__gpu.html#af4ec15f5d6826c016c46b5d7cae62d72',1,'fbgemm_gpu::store_qparams_to_row(uint8_t *ptr, float2 qparams)']]], + 
['stride_131',['stride',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a396d81b04ec72f4281d15a02c7840694',1,'fbgemm_gpu::TensorAccessorBase::stride()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a396d81b04ec72f4281d15a02c7840694',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::stride()']]], + ['strides_132',['strides',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#af446bd0965fd0586067d176a1630a6c1',1,'fbgemm_gpu::TensorAccessorBase']]], + ['sum_133',['sum',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, float >::sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a2b4995ca44cb8977ca258395e80a8687',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::sum()']]], + ['sum_5freduce_5fto_5fone_5fdevice_134',['sum_reduce_to_one_device',['../namespacefbgemm__gpu.html#aa7f73354e0c76fbc0584c3250dadc98e',1,'fbgemm_gpu']]], + ['syncwarp_135',['syncwarp',['../namespacefbgemm__gpu.html#ab776b7b9076d17238d502b2746135ace',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_14.js b/search/functions_14.js new file mode 100644 index 000000000..df52d41a9 --- /dev/null +++ b/search/functions_14.js @@ -0,0 +1,37 @@ +var searchData= +[ + ['tbe_5finput_5fcombine_5fcpu_0',['tbe_input_combine_cpu',['../group__input-combine.html#ga4f8f3f8b825c9d7639c1e45e8dc8b689',1,'fbgemm_gpu']]], + ['tbe_5finput_5fcombine_5fwith_5flength_5fcpu_1',['tbe_input_combine_with_length_cpu',['../namespacefbgemm__gpu.html#a56da764643d07d366219d69333e6f9de',1,'fbgemm_gpu']]], + ['tbe_5finput_5fcombine_5fwith_5flength_5fcuda_2',['tbe_input_combine_with_length_cuda',['../namespacefbgemm__gpu.html#ae818a54243bd2ea4c0841088f07ff327',1,'fbgemm_gpu']]], + 
['tbe_5finput_5fcombine_5fwith_5flength_5fgpu_3',['tbe_input_combine_with_length_gpu',['../namespacefbgemm__gpu.html#af7db32b23d955e760c7dfb4b29a13ca1',1,'fbgemm_gpu']]], + ['tensor_5fon_5fsame_5fgpu_5fif_5fnot_5foptional_5fcheck_4',['tensor_on_same_gpu_if_not_optional_check',['../sparse__ops__utils_8h.html#a5a8411338d3eef3620c7f5be3803c7cd',1,'sparse_ops_utils.h']]], + ['tensoraccessor_5',['TensorAccessor',['../classfbgemm__gpu_1_1_tensor_accessor.html#a6b681d8fc7f13b4b8d31426ec10a0f11',1,'fbgemm_gpu::TensorAccessor::TensorAccessor()'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a55169dff4cc835156c5ccd43240b4c8c',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::TensorAccessor()']]], + ['tensoraccessorbase_6',['TensorAccessorBase',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#ac139dc2b8e88aec4b189a6c41bc135af',1,'fbgemm_gpu::TensorAccessorBase']]], + ['test_7',['TEST',['../embedding__inplace__update__test_8cpp.html#a8eb96d7f557ba896e48fef81f259d7a5',1,'TEST(EmbeddingInplaceUpdateTest, random_update): embedding_inplace_update_test.cpp'],['../cpu__kernel__test_8cpp.html#aa2c7091971cf4fd4bcbb3215ebe612cf',1,'TEST(cpu_kernel_test, csr2csc_test): cpu_kernel_test.cpp'],['../sparse__ops__utils__test_8cpp.html#a9011669ae997bae59aa8f141bd794f11',1,'TEST(sparse_ops_utils_test, undefined_tensors_do_not_trigger): sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#a2d4ac7a4fb22c0789d8510d17f3878db',1,'TEST(sparse_ops_utils_test, cpu_tensors_fail): sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#adc3b9330a7cac1cf2e07268fe7a6bd17',1,'TEST(sparse_ops_utils_test, gpu_tensors_pass): sparse_ops_utils_test.cpp'],['../sparse__ops__utils__test_8cpp.html#ae888046a03bb3fe0f87d23c4915f6994',1,'TEST(sparse_ops_utils_test, optional_tensor_passes): sparse_ops_utils_test.cpp'],['../tensor__assert__test_8cpp.html#af3ce575ab5810b31aae3455d53faacee',1,'TEST(tensor_assert_test, gpu_asserts): 
tensor_assert_test.cpp'],['../uvm__cache__miss__emulate__test_8cpp.html#aab721325808448b876b97faee4b751b9',1,'TEST(uvm_cache_miss_emulate_test, no_cache_miss): uvm_cache_miss_emulate_test.cpp'],['../uvm__cache__miss__emulate__test_8cpp.html#acdba631ddc8a5dc4e4ee2c02959d3e14',1,'TEST(uvm_cache_miss_emulate_test, enforced_cache_miss): uvm_cache_miss_emulate_test.cpp']]], + ['test_5fembedding_5finplace_5fupdate_8',['test_embedding_inplace_update',['../embedding__inplace__update__test_8cpp.html#aac82e2990c8f2f7d3957f862975181a0',1,'embedding_inplace_update_test.cpp']]], + ['thrust_5ffind_5fqparams_9',['thrust_find_qparams',['../namespacefbgemm__gpu.html#a6c54f589eee05a58cebd4cf7cf8b1086',1,'fbgemm_gpu::thrust_find_qparams(scalar_t *input_row, int D)'],['../namespacefbgemm__gpu.html#a8145ebe65a5242bd7a3a15de0d69a70b',1,'fbgemm_gpu::thrust_find_qparams(fbgemm_gpu::Vec4T< scalar_t > *input_row, int D)']]], + ['to_5fbfloat16_10',['to_bfloat16',['../namespacefbgemm__gpu.html#a9d1e20705b5c1c16dd554c81b3766b93',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f16_11',['to_bfloat16_16',['../namespacefbgemm__gpu.html#a3f6b99cce95aa3d297e4b824e577d62d',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f2_12',['to_bfloat16_2',['../namespacefbgemm__gpu.html#a2b8a7fb1619f338df717ef075fe513e4',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f4_13',['to_bfloat16_4',['../namespacefbgemm__gpu.html#a7d0d7114d05a683328a782804ef2bef9',1,'fbgemm_gpu']]], + ['to_5fbfloat16_5f8_14',['to_bfloat16_8',['../namespacefbgemm__gpu.html#a74f150a063fed3144f6d99cde2d46069',1,'fbgemm_gpu']]], + ['to_5fhalf_15',['to_half',['../namespacefbgemm__gpu.html#a3e13c4ba1e371f3bcabf7f6f74ac103e',1,'fbgemm_gpu']]], + ['to_5fhalf16_16',['to_half16',['../namespacefbgemm__gpu.html#a776872b9c8f667b7d05aea83e7287d5d',1,'fbgemm_gpu']]], + ['to_5fhalf2_17',['to_half2',['../namespacefbgemm__gpu.html#aaed7807ac8eef0fb786324d5935c4aca',1,'fbgemm_gpu']]], + 
['to_5fhalf4_18',['to_half4',['../namespacefbgemm__gpu.html#aee1f23de5e5847146cd821595d1978ae',1,'fbgemm_gpu']]], + ['to_5fhalf8_19',['to_half8',['../namespacefbgemm__gpu.html#a40088f5e88d0985b0c9b08808c40e1dd',1,'fbgemm_gpu']]], + ['torch_5flibrary_5ffragment_20',['TORCH_LIBRARY_FRAGMENT',['../gen__embedding__backward__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_cpu.cpp'],['../gen__embedding__backward__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__adam__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__dense__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_cpu.cpp'],['../gen__embedding__backward__dense__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_unweighted_cuda.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__dense__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_dense_split_weighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lamb__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_lars_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_none_split_unweighted_cuda.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_none_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__none__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_none_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_adam_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_adam_split_weighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_partial_rowwise_lamb_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_cpu.cpp'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_cpu.cpp'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_cpu.cpp'],['../gen__embedding__backward__sgd__split__unweighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_unweighted_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_sgd_split_unweighted_nobag_cuda.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_unweighted_vbe_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_weighted_cuda.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_sgd_split_weighted_vbe_cuda.cu'],['../gen__embedding__backward__split__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_adagrad.cpp'],['../gen__embedding__backward__split__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_adagrad.cpp'],['../gen__embedding__backward__split__adam_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_adam.cpp'],['../gen__embedding__backward__split__adam_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_adam.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): 
gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__counter_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__approx__rowwise__adagrad__with__weight__decay_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__approx__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_approx_sgd.cpp'],['../gen__embedding__backward__split__approx__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_approx_sgd.cpp'],['../gen__embedding__backward__split__indice__weights__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_indice_weights_codegen_cuda.cu'],['../gen__embedding__backward__split__lamb_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_lamb.cpp'],['../gen__embedding__backward__split__lamb_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_lamb.cpp'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_lars_sgd.cpp'],['../gen__embedding__backward__split__lars__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_split_lars_sgd.cpp'],['../gen__embedding__backward__split__none_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_none.cpp'],['../gen__embedding__backward__split__none_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_none.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_partial_rowwise_adam.cpp'],['../gen__embedding__backward__split__partial__rowwise__adam_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_partial_rowwise_adam.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_partial_rowwise_lamb.cpp'],['../gen__embedding__backward__split__partial__rowwise__lamb_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_partial_rowwise_lamb.cpp'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__counter_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_backward_split_rowwise_adagrad_with_counter.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__rowwise__adagrad__with__weight__decay_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_adagrad_with_weight_decay.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_rowwise_weighted_adagrad.cpp'],['../gen__embedding__backward__split__rowwise__weighted__adagrad_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_rowwise_weighted_adagrad.cpp'],['../gen__embedding__backward__split__sgd_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): gen_embedding_backward_split_sgd.cpp'],['../gen__embedding__backward__split__sgd_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_backward_split_sgd.cpp'],['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_forward_dense_weighted_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_codegen_meta.cpp'],['../gen__embedding__forward__split__unweighted__vbe__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__unweighted__vbe__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_unweighted_vbe_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_codegen_meta.cpp'],['../gen__embedding__forward__split__weighted__vbe__codegen__cuda_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_vbe_codegen_cuda.cu'],['../gen__embedding__forward__split__weighted__vbe__codegen__meta_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): gen_embedding_forward_split_weighted_vbe_codegen_meta.cpp'],['../gen__embedding__optimizer__rowwise__adagrad__split_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
gen_embedding_optimizer_rowwise_adagrad_split.cpp'],['../batch__index__select__dim0__cpu__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): batch_index_select_dim0_cpu_host.cpp'],['../batch__index__select__dim0__cpu__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): batch_index_select_dim0_cpu_host.cpp'],['../batch__index__select__dim0__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): batch_index_select_dim0_host.cpp'],['../batch__index__select__dim0__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): batch_index_select_dim0_host.cpp'],['../embedding__backward__dense__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): embedding_backward_dense_host.cpp'],['../embedding__backward__dense__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_backward_dense_host.cpp'],['../embedding__bounds__check__host_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): embedding_bounds_check_host.cpp'],['../embedding__bounds__check__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_bounds_check_host.cpp'],['../embedding__bounds__check__host__cpu_8cpp.html#ad1913bdf24279dfcc3932843af149fd0',1,'TORCH_LIBRARY_FRAGMENT(fb, m): embedding_bounds_check_host_cpu.cpp'],['../embedding__bounds__check__host__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_bounds_check_host_cpu.cpp'],['../embedding__forward__quantized__host_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_forward_quantized_host.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
embedding_forward_quantized_host_cpu.cpp'],['../embedding__optimizer__split__host__template_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_optimizer_split_host_template.cpp'],['../embedding__inplace__update__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_inplace_update_cpu.cpp'],['../embedding__inplace__update__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): embedding_inplace_update_gpu.cpp'],['../input__combine__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): input_combine_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): jagged_tensor_ops_cpu.cpp'],['../layout__transform__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): layout_transform_ops_cpu.cpp'],['../namespacefbgemm__gpu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'fbgemm_gpu::TORCH_LIBRARY_FRAGMENT()'],['../merge__pooled__embedding__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): merge_pooled_embedding_ops_cpu.cpp'],['../merge__pooled__embedding__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): merge_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_gpu.cpp'],['../permute__pooled__embedding__ops__split__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): 
permute_pooled_embedding_ops_split_cpu.cpp'],['../permute__pooled__embedding__ops__split__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): permute_pooled_embedding_ops_split_gpu.cpp'],['../quantize__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): quantize_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_ops_cpu.cpp'],['../sparse__ops__gpu_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_ops_gpu.cpp'],['../sparse__zipf_8cu.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): sparse_zipf.cu'],['../split__embeddings__utils_8cpp.html#af53d2b0e9d8aeadd7d5094bd03ea25cc',1,'TORCH_LIBRARY_FRAGMENT(fbgemm, m): split_embeddings_utils.cpp']]], + ['torch_5flibrary_5fimpl_21',['TORCH_LIBRARY_IMPL',['../namespacefbgemm__gpu.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'fbgemm_gpu::TORCH_LIBRARY_IMPL()'],['../jagged__tensor__ops__autograd_8cpp.html#a89761ba0ed893bf88bdfdd1f6d15bc65',1,'TORCH_LIBRARY_IMPL(fbgemm, Autograd, m): jagged_tensor_ops_autograd.cpp'],['../jagged__tensor__ops__autograd_8cpp.html#a5eca359a14102dd9fcab1f8e80594472',1,'TORCH_LIBRARY_IMPL(fbgemm, CompositeImplicitAutograd, m): jagged_tensor_ops_autograd.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__cpu_8cpp.html#aa138561d0eb99d73b2bf9586b84e7c46',1,'TORCH_LIBRARY_IMPL(fbgemm, CompositeExplicitAutograd, m): jagged_tensor_ops_cpu.cpp'],['../jagged__tensor__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): jagged_tensor_ops_meta.cpp'],['../layout__transform__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): 
layout_transform_ops_cpu.cpp'],['../layout__transform__ops__gpu_8cpp.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'TORCH_LIBRARY_IMPL(fbgemm, CUDA, m): layout_transform_ops_gpu.cpp'],['../quantize__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): quantize_ops_cpu.cpp'],['../quantize__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): quantize_ops_meta.cpp'],['../sparse__ops__cpu_8cpp.html#a26b96ceaa00c9be7dbba99ca0b772a58',1,'TORCH_LIBRARY_IMPL(fbgemm, CPU, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#a89761ba0ed893bf88bdfdd1f6d15bc65',1,'TORCH_LIBRARY_IMPL(fbgemm, Autograd, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#af0fdef89a7a61f1f510ed4bb5f6d5398',1,'TORCH_LIBRARY_IMPL(fbgemm, AutogradCPU, m): sparse_ops_cpu.cpp'],['../sparse__ops__cpu_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_cpu.cpp'],['../sparse__ops__gpu_8cpp.html#a257a9d9e0a71b3a1299af6ef9c6c3a78',1,'TORCH_LIBRARY_IMPL(fbgemm, CUDA, m): sparse_ops_gpu.cpp'],['../sparse__ops__gpu_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_gpu.cpp'],['../sparse__ops__gpu_8cpp.html#a8fd406590cd83f4dec4a63c7c1b9ce78',1,'TORCH_LIBRARY_IMPL(fbgemm, AutogradCUDA, m): sparse_ops_gpu.cpp'],['../sparse__ops__meta_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): sparse_ops_meta.cpp'],['../split__embeddings__utils_8cpp.html#a5a1490b57e6f9b7f7f7b12c0359a2f91',1,'TORCH_LIBRARY_IMPL(fbgemm, Meta, m): split_embeddings_utils.cpp']]], + ['torch_5ftensor_5fdevice_5fname_22',['torch_tensor_device_name',['../sparse__ops__utils_8h.html#a535403fdc5c523b45f0d56d657e17f7b',1,'torch_tensor_device_name(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a319c921d3abe8bdb14140b45afe9afdb',1,'torch_tensor_device_name(const c10::optional< at::Tensor > &ten): 
sparse_ops_utils.h']]], + ['torch_5ftensor_5fempty_5for_5fon_5fcpu_5fcheck_23',['torch_tensor_empty_or_on_cpu_check',['../sparse__ops__utils_8h.html#a6328f240dd58293d0349471dca28797e',1,'torch_tensor_empty_or_on_cpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#afc4520e447e8ad48a316af75860d84ae',1,'torch_tensor_empty_or_on_cpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fempty_5for_5fon_5fcuda_5fgpu_5fcheck_24',['torch_tensor_empty_or_on_cuda_gpu_check',['../sparse__ops__utils_8h.html#abb9778e9fb75a70593c27e53dca268cd',1,'torch_tensor_empty_or_on_cuda_gpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#aac863615b6eba91282fcf07b5e9a5460',1,'torch_tensor_empty_or_on_cuda_gpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fon_5fcpu_5fcheck_25',['torch_tensor_on_cpu_check',['../sparse__ops__utils_8h.html#ad971d56f6b82b6c62a2d6fed276b0463',1,'torch_tensor_on_cpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#af4afd1e331412cf092a70d0fd816aed8',1,'torch_tensor_on_cpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fon_5fcuda_5fgpu_5fcheck_26',['torch_tensor_on_cuda_gpu_check',['../sparse__ops__utils_8h.html#a5568d44e6066339da1326798f9637b16',1,'torch_tensor_on_cuda_gpu_check(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a99211623695fce2a359b74a5823b58b8',1,'torch_tensor_on_cuda_gpu_check(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['torch_5ftensor_5fon_5fsame_5fdevice_5fcheck_27',['torch_tensor_on_same_device_check',['../sparse__ops__utils_8h.html#a5683dd4c2143c3c0ba0eeb80fd5223f0',1,'torch_tensor_on_same_device_check(const at::Tensor &ten1, const at::Tensor &ten2): 
sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#ac60c66ce5a4058e4906907960f82f1be',1,'torch_tensor_on_same_device_check(const at::Tensor &ten1, const c10::optional< at::Tensor > &ten2): sparse_ops_utils.h']]], + ['torch_5ftensor_5fundefined_28',['torch_tensor_undefined',['../sparse__ops__utils_8h.html#ab583553d9bf8ca92fadb8a81ffd40cd8',1,'torch_tensor_undefined(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#a5e916ca6a05a17d36e5341d929cc18e0',1,'torch_tensor_undefined(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['transpose_29',['transpose',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#aa4aba7637a10c7b8b839ef27952e855d',1,'fbgemm_gpu::GenericPackedTensorAccessor::transpose()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a0ed7d1e6f585332c781fc568e1fad1ac',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::transpose()']]], + ['transpose_5fembedding_5finput_30',['transpose_embedding_input',['../split__embeddings__utils_8cuh.html#a508f832d3fec529868cbb1f9fa9defc8',1,'transpose_embedding_input(at::Tensor hash_size_cumsum, int64_t total_hash_size_bits, at::Tensor indices, at::Tensor offsets, bool nobag=false, const c10::optional< at::Tensor > &vbe_b_t_map=c10::optional< at::Tensor >(), const int64_t info_B_num_bits=26, const int64_t info_B_mask=0x2FFFFFF, const int64_t total_unique_indices=-1, const bool is_index_select=false, const c10::optional< at::Tensor > &total_L_offsets=c10::optional< at::Tensor >(), const int64_t fixed_L_per_warp=0, const int64_t num_warps_per_feature=0): split_embeddings_utils.cuh'],['../transpose__embedding__input_8cu.html#a569a769e3233130cce363d9ae151bd26',1,'transpose_embedding_input(Tensor hash_size_cumsum, int64_t total_hash_size_bits, Tensor indices, Tensor offsets, bool nobag, const c10::optional< Tensor > &vbe_b_t_map, const int64_t info_B_num_bits, const int64_t info_B_mask, 
const int64_t total_unique_indices, const bool is_index_select, const c10::optional< Tensor > &total_L_offsets, const int64_t fixed_L_per_warp, const int64_t num_warps_per_feature): transpose_embedding_input.cu']]], + ['trapz_5fkernel_31',['trapz_kernel',['../namespacefbgemm__gpu.html#a45142e19fe831c9d085bb097b7d946b2',1,'fbgemm_gpu']]], + ['true_32',['true',['../gen__embedding__backward__split__grad_8cu.html#af0ccb06b8169682c123d1399ed8e1869',1,'true(): gen_embedding_backward_split_grad.cu'],['../namespacenbit.html#ae298c42e84018c608c72200f61270827',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a9233b0f37aec7890155371e3f1f8a4c6',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ad461b37bcc67ce85965ea3d63318b609',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6610e53a686bcaa7c0c055493223b286',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa0e536c5986677aa5c753d497c9ec6ea',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations)'],['../namespacenbit.html#a744a48f6ba12a807eed65323fac0d7b9',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa7f9e825cb23814721fa128e75fd54df',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int 
exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#af5e4b89707ccb6db711f4b214120f6d4',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae950ec6b1a6c8e70896ceea8585e8a94',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a80b1856aa5c50bef02b6cfc6e07a738f',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5190453e12b3ae3d90ccbad2d0fd3366',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aeab80be016250076834edd018371fadc',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ac4473fe74a275df878cef6094b97142f',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5e7304badb9669f2af28007bc9faa533',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a000a2e8569876d491d4d9578f5bca2fb',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, 
FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a1b561270c0c573adbb9b099b20a3ca71',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aa4e2b761fd2635bd5d972c84f9e28837',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aea0485b6b1bbf758999bd85f6affc052',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ab843cff102b60ffbfb639c2371b90f7b',1,'nbit::true(const 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a085775b780406668fe81c55a30eb3098',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, 
pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a5614c839b9baa44dd6962fe11a148918',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#af580fa47263724bff70ce910764bea41',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#acb54005a5872970a6721deca8ff5cd99',1,'nbit::true(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a7d2686b58c584f889807ad3902056eac',1,'nbit::true(const 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a06d07c66722a850f758f54932d3dbe17',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1aa60c6099666e18389fa1e982910986',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8de160ae737c50e86160493247817870',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aa74dcf7a765d22c0b1ec49310c9a04b3',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, 
const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5649e552b4b7bb69095114018ba395fb',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9ec2bf37e5db917feed838745ed81985',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a16cf98f36e41cdcacdb6dabac0b258e0',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#ab6f0a4b5648537896b38264e4d38f9aa',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1e7e3a44299ea276cb2e5f5082977777',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, 
const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a3d5bd72dd7f6e6c6b0a50b2070e74f45',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af605abd85d3cc9e6dca40ea687104f6e',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a9c2ae1d1bfa19b2caadbc8e76c32697c',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa41a6064cb3571ecd43c9da816216785',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a66d27435490ba7673e7362fca9cc8f7e',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool 
mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ad88bb49652d4d156c75abb8ca2419542',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0a28fe8dcfa38da6241b67d3ec3e4ff2',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, 
uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a09d02507a5cf390975fafa6a5c7096e8',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aa93f293dcfd38afcd57776f33ceb8490',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab59b0abf8963d48e63c90334daea4fc5',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t 
*__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab80c4590dcdff94d23d4f89f1c7e0039',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#addc9e8fb4cd569b143bff818ca6e068b',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t 
*__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aca3f7571841f3f5e46e703a210f5ef3d',1,'true(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afeba51154f1a22327b47305480f43671',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a4a3bc2db616d7f8f845d8e0cd092fd56',1,'true(const float *__restrict__ const dev_weights, const float *__restrict__ const 
uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['two_5fto_5fe_33',['two_to_e',['../verify__fp16__stochastic__benchmark_8cu.html#ab29b4915253bcafe11f5d95cfb227c0b',1,'verify_fp16_stochastic_benchmark.cu']]] +]; diff --git a/search/functions_15.js b/search/functions_15.js new file mode 100644 index 000000000..07ba50396 --- /dev/null +++ b/search/functions_15.js @@ -0,0 +1,12 @@ +var searchData= +[ + ['unpack_5fsegments_5fcuda_5fkernel_0',['unpack_segments_cuda_kernel',['../namespacefbgemm__gpu.html#a0ca17769ee2a4593b447a78e3d3fe429',1,'fbgemm_gpu']]], + ['unpadded_5frow_5fsize_5fin_5fbytes_1',['unpadded_row_size_in_bytes',['../namespacenbit.html#a7654c0df9e54aa58c35fe39c53130cbc',1,'nbit']]], + ['uvm_5fcuda_5fmem_5fadvise_2',['uvm_cuda_mem_advise',['../group__cumem-utils.html#gae8c724e90d31245756fc4b0d975f9370',1,'fbgemm_gpu']]], + ['uvm_5fcuda_5fmem_5fprefetch_5fasync_3',['uvm_cuda_mem_prefetch_async',['../group__cumem-utils.html#gaf060db44e71e3419df6e596614ef2081',1,'fbgemm_gpu']]], + ['uvm_5fmem_5fadvice_5fdont_5ffork_4',['uvm_mem_advice_dont_fork',['../group__cumem-utils.html#ga01301ad686f7570c21e81c122d2c7af8',1,'fbgemm_gpu']]], + ['uvm_5fstorage_5',['uvm_storage',['../group__cumem-utils.html#ga05bf2c435c434904ca454c6992861cb6',1,'fbgemm_gpu']]], + ['uvm_5fto_5fcpu_6',['uvm_to_cpu',['../group__cumem-utils.html#gab5a3dab831988b1ce368ccc545b75b48',1,'fbgemm_gpu']]], + 
['uvm_5fto_5fcpu_5fclone_7',['uvm_to_cpu_clone',['../group__cumem-utils.html#ga161495e682d9eac3701dca87469930db',1,'fbgemm_gpu']]], + ['uvm_5fto_5fdevice_8',['uvm_to_device',['../group__cumem-utils.html#gaebfedcf8e6017a6d4f6fb16b52c4c04e',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_16.js b/search/functions_16.js new file mode 100644 index 000000000..b822fdfbd --- /dev/null +++ b/search/functions_16.js @@ -0,0 +1,12 @@ +var searchData= +[ + ['vec4_5facc_0',['vec4_acc',['../namespacefbgemm__gpu.html#ab2a027e4907e39797b913faa6b4e7270',1,'fbgemm_gpu']]], + ['vec4_5fmax_1',['vec4_max',['../namespacefbgemm__gpu.html#a635410cfe229b71efb90199b72107f86',1,'fbgemm_gpu']]], + ['vec4_5fmin_2',['vec4_min',['../namespacefbgemm__gpu.html#ae8a02a5464fb9156400157b45a947c58',1,'fbgemm_gpu']]], + ['vec4acct_3',['Vec4AccT',['../structfbgemm__gpu_1_1_vec4_acc_t.html#a7d2508ce413d52826f32884f52ad2f90',1,'fbgemm_gpu::Vec4AccT']]], + ['vec4stept_4',['Vec4StepT',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a6d2826b97c8d5f17a31ed7e7854615ad',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >']]], + ['vec4t_5',['Vec4T',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< float >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< float >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< at::Half 
>::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< at::Half >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::Vec4T(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a2cd51fbd0d3886a28acea0b4f47ca118',1,'fbgemm_gpu::Vec4T< double >::Vec4T()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a5567e55ab954640ee5bb6204c4fcf75b',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a6e80eaeff7fa50dc31b3426b7cbdf919',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ae39dfa9a228f8ce23816438c9bdab827',1,'fbgemm_gpu::Vec4T< double 
>::Vec4T(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#adadc08c2f27a9f6dfa8993ec8948cc65',1,'fbgemm_gpu::Vec4T< double >::Vec4T(const double *p)']]], + ['vec_5fcopy_5fwith_5fimplicit_5ftype_5fcast_6',['vec_copy_with_implicit_type_cast',['../namespacefbgemm__gpu.html#a8c639f9912105390e4083332e01ecc57',1,'fbgemm_gpu']]], + ['vec_5fwidth_7',['VEC_WIDTH',['../namespacefbgemm__gpu.html#a14fea42ceabd6ac042ad0d2fe5452762',1,'fbgemm_gpu::VEC_WIDTH(combined_indices, indices_addrs[list_id], src_idx, indices_start+src_idx, indices_end - indices_start)'],['../namespacefbgemm__gpu.html#a5aef253d76748f681c0e5d7e1620c8c9',1,'fbgemm_gpu::VEC_WIDTH(combined_lengths, lengths_addrs[list_id], src_idx, lengths_start+src_idx, lengths_end - lengths_start)']]], + ['vecnt_8',['VecNT',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ac774386ebb8ac7021a221b0d32041e40',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::VecNT(float a)'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a762e9c277918a40b3e1577984507b77d',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::VecNT(half2 a)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ae4b5f2ee834300f0c91a1e1f247b56a5',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::VecNT(uint32_t v, const int exp_bits, const int 
exp_bias)'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::VecNT(uint32_t v, half2 shift_scale)'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::VecNT(uint32_t v, half2 shift_scale)'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#af44b6695d2ac77093130f394c322417d',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::VecNT()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#af7d39695d99328f4f6e8faf36a115e94',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::VecNT(uint32_t v, half2 shift_scale)']]] +]; diff --git a/search/functions_17.js b/search/functions_17.js new file mode 100644 index 000000000..bbc93a10d --- /dev/null +++ b/search/functions_17.js @@ -0,0 +1,14 @@ +var searchData= +[ + ['warp_5fcopy_5fto_0',['warp_copy_to',['../structfbgemm__gpu_1_1_weight_row.html#a4a0da3213c0d4a99586cbe6e6ec72107',1,'fbgemm_gpu::WeightRow']]], + ['warp_5fevict_1',['warp_evict',['../structfbgemm__gpu_1_1_weight_row.html#ae00ddf1640cea584b79618dfd69d91d2',1,'fbgemm_gpu::WeightRow']]], + ['warp_5ffind_5fqparams_2',['warp_find_qparams',['../namespacefbgemm__gpu.html#a78a26de691da2f45a0e4ddaeda75009d',1,'fbgemm_gpu']]], + ['warp_5freduce_5fmax_3',['warp_reduce_max',['../namespacefbgemm__gpu.html#acddba9c219634f979df1c8b943ac5e88',1,'fbgemm_gpu']]], + 
['warp_5freduce_5fmin_4',['warp_reduce_min',['../namespacefbgemm__gpu.html#af554571b877e978f495835af1920f4fb',1,'fbgemm_gpu']]], + ['warpbitonicmergele16_5',['warpBitonicMergeLE16',['../namespacefbgemm__gpu.html#a9bd92b10074adc4fc58e4671a1d1d576',1,'fbgemm_gpu']]], + ['warpreduceallsum_6',['warpReduceAllSum',['../namespacefbgemm__gpu.html#ad47dc8c3cfd941ea7a92b1cb677abf8e',1,'fbgemm_gpu']]], + ['weighted_5fsum_7',['weighted_sum',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, float >::weighted_sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::weighted_sum()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#aa7e031196d379ec4120ba58cd6b48024',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::weighted_sum()']]], + ['weightrow_8',['WeightRow',['../structfbgemm__gpu_1_1_weight_row.html#acb13973152d6d76389dafdf6e69e6793',1,'fbgemm_gpu::WeightRow']]], + ['while_9',['while',['../namespacefbgemm__gpu.html#a44128eca539acfe55bdf792616e8b5b6',1,'fbgemm_gpu']]], + ['write_5floop_5fsmall_5fls_10',['write_loop_small_Ls',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > *const accumulator, const uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > *const accumulator, const 
uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a784fce39007138a17294839676673bde',1,'write_loop_small_Ls(long *const smem, uint32_t *const write_idx, uint32_t *const bag_boundary, int32_t *const next_boundary, uint32_t *const L, Vec4StepT< STEP, emb_t > *const accumulator, const uint32_t params_offset, const uint32_t l, const bool process_d, const bool mean_pooling): embedding_forward_split_kernel_v2_template.cu']]] +]; diff --git a/search/functions_18.js b/search/functions_18.js new file mode 100644 index 000000000..6e78def8d --- /dev/null +++ b/search/functions_18.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['zipf_5fcuda_0',['zipf_cuda',['../namespacefbgemm__gpu.html#a957e5dced6114b32a6d2e5e62011adbf',1,'fbgemm_gpu']]], + ['zipf_5fkernel_1',['zipf_kernel',['../namespacefbgemm__gpu.html#a6991817ca1213e7cc0eba3bad689c03a',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_19.js b/search/functions_19.js new file mode 100644 index 000000000..28e447783 --- /dev/null +++ b/search/functions_19.js @@ -0,0 +1,5 @@ +var searchData= +[ + ['_7ehypercompressedsparsecolumn_0',['~HyperCompressedSparseColumn',['../structinternal_1_1_hyper_compressed_sparse_column.html#a60d5f8ac0716350bb51bcf02ed10aaeb',1,'internal::HyperCompressedSparseColumn']]], + ['_7einitializer_1',['~Initializer',['../classssd_1_1_initializer.html#a7a69aed99981539d9a2c0ee85459b4b6',1,'ssd::Initializer']]] +]; diff --git a/search/functions_2.js b/search/functions_2.js index 9659821c5..ed60e090f 100644 --- a/search/functions_2.js +++ b/search/functions_2.js @@ -1,4 +1,28 @@ var searchData= [ - ['expand_5finto_5fjagged_5fpermute_5fcuda_0',['expand_into_jagged_permute_cuda',['../group__sparse-data-cuda.html#ga2402de1c0102b21af5f2bd5a50d30309',1,'fbgemm_gpu']]] + 
['backward_0',['backward',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html#ac7ddba5222bfda33f8a498f8394349bf',1,'fbgemm_gpu::PermutePooledEmbsFunction::backward()'],['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html#ad62a42e85be3aa7f972677a4f7b710f9',1,'fbgemm_gpu::PermutePooledEmbsFunctionSplit::backward()']]], + ['ballot_5fsync_1',['ballot_sync',['../namespacefbgemm__gpu.html#ac9ef3cbe68285c5559d30c5157131e29',1,'fbgemm_gpu']]], + ['batch_5fauc_2',['batch_auc',['../namespacefbgemm__gpu.html#abeeb6bd4d39a0e534db2213258704285',1,'fbgemm_gpu']]], + ['batch_5findex_5fselect_5fdim0_5fcodegen_5fbackward_5fcuda_3',['batch_index_select_dim0_codegen_backward_cuda',['../gen__batch__index__select__dim0__backward__codegen__cuda_8cu.html#a5709eebbefa399282269508003e47e25',1,'batch_index_select_dim0_codegen_backward_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const int64_t max_segment_length_per_warp, const Tensor &grad_offsets, const Tensor &total_L_offsets, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_codegen_cuda.cu'],['../batch__index__select__dim0__host_8cpp.html#a5709eebbefa399282269508003e47e25',1,'batch_index_select_dim0_codegen_backward_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &hash_size_cumsum, const int64_t total_hash_size_bits, const Tensor &indices, const int64_t max_segment_length_per_warp, const Tensor &grad_offsets, const Tensor &total_L_offsets, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_codegen_cuda.cu']]], + 
['batch_5findex_5fselect_5fdim0_5fcodegen_5fforward_5fcuda_4',['batch_index_select_dim0_codegen_forward_cuda',['../gen__batch__index__select__dim0__forward__codegen__cuda_8cu.html#a5951ed801e11a01c29c7bbfb648ee230',1,'batch_index_select_dim0_codegen_forward_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const int64_t output_dtype, const Tensor &output_offsets, const Tensor &total_L_offsets, const int64_t output_size, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_forward_codegen_cuda.cu'],['../batch__index__select__dim0__host_8cpp.html#a5951ed801e11a01c29c7bbfb648ee230',1,'batch_index_select_dim0_codegen_forward_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const int64_t output_dtype, const Tensor &output_offsets, const Tensor &total_L_offsets, const int64_t output_size, const int32_t fixed_L_per_warp, const int32_t num_warps_per_feature, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_forward_codegen_cuda.cu']]], + ['batch_5findex_5fselect_5fdim0_5fcpu_5',['batch_index_select_dim0_cpu',['../batch__index__select__dim0__cpu__host_8cpp.html#aa719f2231fb791074324f6bbeace9d0c',1,'batch_index_select_dim0_cpu_host.cpp']]], + ['batch_5findex_5fselect_5fdim0_5fgpu_6',['batch_index_select_dim0_gpu',['../batch__index__select__dim0__host_8cpp.html#a5bad7a4ddb5cf6144ad19b6296ef585c',1,'batch_index_select_dim0_host.cpp']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_7',['batched_dense_vec_jagged_2d_mul',['../group__jagged-tensor-ops-cpu.html#ga67afdd148d57be07278c9cb088b5ff4b',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_8',['batched_dense_vec_jagged_2d_mul_backward',['../namespacefbgemm__gpu.html#ae815e5156f29e106f0fcb6054d386afa',1,'fbgemm_gpu']]], + 
['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fbackward_5fmeta_9',['batched_dense_vec_jagged_2d_mul_backward_meta',['../namespacefbgemm__gpu.html#af5324c97be6dc5aecbc40e4e3244646f',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_10',['batched_dense_vec_jagged_2d_mul_forward',['../namespacefbgemm__gpu.html#ac3080e0008d5cdd9f1f32b33e38aee95',1,'fbgemm_gpu']]], + ['batched_5fdense_5fvec_5fjagged_5f2d_5fmul_5fforward_5fmeta_11',['batched_dense_vec_jagged_2d_mul_forward_meta',['../namespacefbgemm__gpu.html#a399af8be70030a7aeaedbdf546efe61a',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fbackward_5fcuda_12',['batched_unary_embeddings_backward_cuda',['../namespacefbgemm__gpu.html#a0e4965515624f44fcd114ff1e5ff0998',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fforward_5fcpu_13',['batched_unary_embeddings_forward_cpu',['../namespacefbgemm__gpu.html#a96db75aa5b2617976c2937ab051b737e',1,'fbgemm_gpu']]], + ['batched_5funary_5fembeddings_5fforward_5fcuda_14',['batched_unary_embeddings_forward_cuda',['../namespacefbgemm__gpu.html#a9895cf76445e7258f2464bb037d2c54c',1,'fbgemm_gpu']]], + ['benchmark_5ffunction_15',['benchmark_function',['../bench__utils_8cuh.html#a8b8729bf92a232e1ff3403ebe7089fdd',1,'bench_utils.cuh']]], + ['bfloat16quantizedtofloat_5fref_16',['BFloat16QuantizedToFloat_ref',['../namespacefbgemm__gpu.html#a0f1d1afe56f116552e1ca9759e6e0fcc',1,'fbgemm_gpu']]], + ['binary_5fsearch_5frange_17',['binary_search_range',['../namespacefbgemm__gpu.html#a13b4df4139f3c64ac4d8dbea51a7e7a0',1,'fbgemm_gpu']]], + ['binary_5fsearch_5frange_5fcpu_18',['binary_search_range_cpu',['../sparse__ops__utils_8h.html#a519154f3b89148b1b70e45d8c340ff81',1,'sparse_ops_utils.h']]], + ['block_5fbucketize_5fsparse_5ffeatures_5fcpu_19',['block_bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a270e4d8df103fa6c3e6750890608b566',1,'fbgemm_gpu']]], + 
['block_5fbucketize_5fsparse_5ffeatures_5fcuda_20',['block_bucketize_sparse_features_cuda',['../namespacefbgemm__gpu.html#a293dc249ac4679d97747778a7fb02bd5',1,'fbgemm_gpu']]], + ['bounds_5fcheck_5f_21',['bounds_check_',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a0e958eecb22f175be483bef10d6e2597',1,'fbgemm_gpu::GenericPackedTensorAccessorBase']]], + ['bounds_5fcheck_5findices_5fcuda_22',['bounds_check_indices_cuda',['../group__embedding-cuda.html#gad1c20ea2ace30c269811890919ebdb6e',1,'bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode_, Tensor &warning, const c10::optional< Tensor > &weights, const c10::optional< Tensor > &B_offsets, const int64_t max_B): embedding_bounds_check.cu'],['../group__embedding-cuda.html#gad1c20ea2ace30c269811890919ebdb6e',1,'bounds_check_indices_cuda(Tensor &rows_per_table, Tensor &indices, Tensor &offsets, int64_t bounds_check_mode, Tensor &warning, const c10::optional< Tensor > &weights, const c10::optional< Tensor > &B_ofsets, const int64_t max_B): embedding_bounds_check.cu']]], + ['bucketize_5fsparse_5ffeatures_5fcpu_23',['bucketize_sparse_features_cpu',['../namespacefbgemm__gpu.html#a83c70249ce058969210bda8aedf671a4',1,'fbgemm_gpu']]], + ['bucketize_5fsparse_5ffeatures_5fcuda_24',['bucketize_sparse_features_cuda',['../namespacefbgemm__gpu.html#abb94f2bd00f8ee054a4a1d2417a093d1',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_3.js b/search/functions_3.js index cf90c2986..767821b32 100644 --- a/search/functions_3.js +++ b/search/functions_3.js @@ -1,5 +1,29 @@ var searchData= [ - ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_0',['generic_histogram_binning_calibration_by_feature_cpu',['../group__sparse-data-cpu.html#gaef2a0a8c27e3b8b2d72be5c95ba7539e',1,'fbgemm_gpu']]], - 
['get_5funique_5findices_5fcuda_1',['get_unique_indices_cuda',['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu']]] + ['calc_5foffsets_5frange_5fthread_5fblock_0',['calc_offsets_range_thread_block',['../namespacefbgemm__gpu.html#ae0656dd690bcffdd8b470d894e25b2d8',1,'fbgemm_gpu']]], + ['cat_5freorder_5fbatched_5fad_5findices_5fcpu_1',['cat_reorder_batched_ad_indices_cpu',['../namespacefbgemm__gpu.html#a1ed236113fa360c41a2eb0507c3fc2c7',1,'fbgemm_gpu']]], + ['cat_5freorder_5fbatched_5fad_5findices_5fcpu_5f_2',['cat_reorder_batched_ad_indices_cpu_',['../namespacefbgemm__gpu.html#a6b5e65a3f532db97f093037c9dcb3902',1,'fbgemm_gpu']]], + ['compact_3',['compact',['../classssd_1_1_embedding_rocks_d_b.html#a043cdfc194924194e381a986c229569e',1,'ssd::EmbeddingRocksDB']]], + ['compact_5fif_5fnecessary_4',['compact_if_necessary',['../classssd_1_1_embedding_rocks_d_b.html#a92b07dcd61720ad3a72dbbad89c26514',1,'ssd::EmbeddingRocksDB']]], + ['compute_5ffrequency_5fsequence_5',['compute_frequency_sequence',['../namespacefbgemm__gpu.html#a6b41d7b032eb1abe61eee0bd903d8dfb',1,'fbgemm_gpu']]], + ['compute_5fnum_5fuint64s_6',['compute_num_uint64s',['../namespacefbgemm__gpu.html#af861e4a8f7b669619744fe59ca2f73a3',1,'fbgemm_gpu']]], + ['convert_5ffloat_5fto_5fhalf_5fassemblefloat_7',['convert_float_to_half_assemblefloat',['../verify__fp16__stochastic__benchmark_8cu.html#abbb1b78a4249b42b116429258ac56174',1,'verify_fp16_stochastic_benchmark.cu']]], + 
['convert_5ffloat_5fto_5fhalf_5fbitcarry_8',['convert_float_to_half_bitcarry',['../verify__fp16__stochastic__benchmark_8cu.html#a46898a808f7408d99e7ad4c7fc0fea2a',1,'verify_fp16_stochastic_benchmark.cu']]], + ['convert_5ffloat_5fto_5fhalf_5fdirect_9',['convert_float_to_half_direct',['../verify__fp16__stochastic__benchmark_8cu.html#a169a7087c41e8efae2d09cfc78fa802e',1,'verify_fp16_stochastic_benchmark.cu']]], + ['convert_5ffloat_5fto_5fhalf_5fshortrand_10',['convert_float_to_half_shortrand',['../verify__fp16__stochastic__benchmark_8cu.html#ab109332ca0fae3f39a7d000348a1401c',1,'verify_fp16_stochastic_benchmark.cu']]], + ['copy_11',['copy',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#aa3322732b0a44cf924b89a066f4503d4',1,'fbgemm_gpu::Vec4T< float >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ad6a7665bbc9596b7b9123c9a0605fe1c',1,'fbgemm_gpu::Vec4T< at::Half >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a140a9bcb80dcfae69a427d885d148952',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::copy()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a8af22674533453883301576ae485699c',1,'fbgemm_gpu::Vec4T< double >::copy()']]], + ['copy_5fstr_12',['copy_str',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a7c56e8e49eb26679b9cf3a65c3bd38a9',1,'fbgemm_gpu::GenericPackedTensorAccessorBase']]], + ['cp_5fasync_5ffence_13',['cp_async_fence',['../namespacenbit.html#a9d3f5c31c0728bd8031522979f9fd236',1,'nbit']]], + ['cp_5fasync_5fwait_14',['cp_async_wait',['../namespacenbit.html#ab71806d51c0bb2fbc0b08fb3ed2b442e',1,'nbit']]], + ['cp_5fasync_5fwait_3c_200_20_3e_15',['cp_async_wait< 0 >',['../namespacenbit.html#a869b22b83f81fa2ed2302ceb80d9b9ca',1,'nbit']]], + ['cp_5fasync_5fzfill_16',['cp_async_zfill',['../namespacenbit.html#ac46112b67b5de646034bc1d35d44c8fe',1,'nbit']]], + ['cp_5fasync_5fzfill_5fcg_17',['cp_async_zfill_cg',['../namespacenbit.html#a7f38bc64db06ad5e5ee1b4efa55c349d',1,'nbit']]], + 
['csr2csc_18',['csr2csc',['../namespaceinternal.html#adff2ce52cb6a5e84b57614a452aa77d5',1,'internal']]], + ['csr2csc_3c_20double_20_3e_19',['csr2csc< double >',['../namespaceinternal.html#ab8f896e4d2c97b1369a8e5fb7d9408b7',1,'internal']]], + ['csr2csc_3c_20float_20_3e_20',['csr2csc< float >',['../namespaceinternal.html#a3715c6c222855aa1b842c358fe2a6420',1,'internal']]], + ['cuda_5fcalc_5fblock_5fcount_21',['cuda_calc_block_count',['../sparse__ops__utils_8h.html#ab702f2479ba0bedf91c18e0b644b210a',1,'sparse_ops_utils.h']]], + ['cuda_5fcalc_5fxblock_5fcount_22',['cuda_calc_xblock_count',['../sparse__ops__utils_8h.html#a2eba06f69b5b34fe6ca0eafb0240d369',1,'sparse_ops_utils.h']]], + ['cuda_5fcalc_5fxblock_5fcount_5fbase_23',['cuda_calc_xblock_count_base',['../sparse__ops__utils_8h.html#a885f787cafec301665604303ae43a2e3',1,'sparse_ops_utils.h']]], + ['cuda_5fkernel_5floop_24',['CUDA_KERNEL_LOOP',['../namespacefbgemm__gpu.html#a14c0f0b2b6107f2b17eb472d9be9fb03',1,'fbgemm_gpu::CUDA_KERNEL_LOOP(b_t, lengths_size)'],['../namespacefbgemm__gpu.html#ab331d23c5119efeb513b36fed74c53b0',1,'fbgemm_gpu::CUDA_KERNEL_LOOP(r, lengths_size)']]], + ['cutlass_5fget_5fsmem_5fpointer_25',['cutlass_get_smem_pointer',['../namespacenbit.html#a64cf76bab7c5be6cb2b0c7d1b77443a5',1,'nbit::cutlass_get_smem_pointer(void *ptr)'],['../namespacenbit.html#a250008d643379010295dede0b64068c6',1,'nbit::cutlass_get_smem_pointer(void const *ptr)']]] ]; diff --git a/search/functions_4.js b/search/functions_4.js index 46cb13feb..ec0b17a01 100644 --- a/search/functions_4.js +++ b/search/functions_4.js @@ -1,5 +1,32 @@ var searchData= [ - ['histogram_5fbinning_5fcalibration_5fcpu_0',['histogram_binning_calibration_cpu',['../group__sparse-data-cpu.html#ga201bb2241fc9d582d6c0fe968b0e71ca',1,'fbgemm_gpu']]], - ['host_5flxu_5fcache_5fslot_1',['host_lxu_cache_slot',['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): 
lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu']]] + ['d_0',['D',['../classfbgemm__gpu_1_1_fixed_divisor.html#aa0904583fc7c962f6ae008052d6dadf7',1,'fbgemm_gpu::FixedDivisor']]], + ['data_1',['data',['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a00eb43c6e0e2f9b3a5d083cf44bad46c',1,'fbgemm_gpu::TensorAccessorBase::data()'],['../classfbgemm__gpu_1_1_tensor_accessor_base.html#a445a0aad25aa4b10485392cab109a77b',1,'fbgemm_gpu::TensorAccessorBase::data() const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a00eb43c6e0e2f9b3a5d083cf44bad46c',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::data()'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#a445a0aad25aa4b10485392cab109a77b',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::data() const']]], + ['db_5fshard_2',['db_shard',['../namespacessd.html#ac0918c17a5ef4ae94a7d4068512744f9',1,'ssd']]], + ['decl_5fradix_5fsort_5fpairs_5ffn_3',['DECL_RADIX_SORT_PAIRS_FN',['../split__embeddings__utils_8cuh.html#a07c7c57b2dd34f8dcede30593003253c',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, float): split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a665ecb055cdda875801b442d35297e10',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, double): split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a68379ca489210e052be87595ff7c1ec7',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, int64_t): split_embeddings_utils.cuh'],['../split__embeddings__utils_8cuh.html#a94564bf3eeebee1b64b0fe3ba0b3b7e0',1,'DECL_RADIX_SORT_PAIRS_FN(int64_t, int32_t): split_embeddings_utils.cuh']]], + ['def_5fradix_5fsort_5fpairs_5ffn_4',['DEF_RADIX_SORT_PAIRS_FN',['../radix__sort__pairs_8cu.html#aca8b050260de3f4f24d6bb405cbbdd85',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, float): radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#a8ff9c3ca029c1596694941f07c7b2dc4',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, double): 
radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#a932f303789b405fceb31dd0f40f10d43',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, int64_t): radix_sort_pairs.cu'],['../radix__sort__pairs_8cu.html#ac3e8e7f0d44c6e7d4a5aea790dca2526',1,'DEF_RADIX_SORT_PAIRS_FN(int64_t, int32_t): radix_sort_pairs.cu']]], + ['dense_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_5',['dense_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#a840483d38dd0ee3fe4b398ebee5bf3d7',1,'dense_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a840483d38dd0ee3fe4b398ebee5bf3d7',1,'dense_embedding_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu']]], + ['dense_5fembedding_5fcodegen_5fforward_5funweighted_5fmeta_6',['dense_embedding_codegen_forward_unweighted_meta',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#ac9e6ce9ed24a999160137cd295420a9f',1,'gen_embedding_forward_dense_unweighted_codegen_meta.cpp']]], + ['dense_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_7',['dense_embedding_codegen_forward_weighted_cuda',['../gen__embedding__forward__dense__weighted__codegen__cuda_8cu.html#a4e4e521f171d17c5d78bee2b3c9b21db',1,'dense_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, 
const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_weighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#a4e4e521f171d17c5d78bee2b3c9b21db',1,'dense_embedding_codegen_forward_weighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t total_D, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const int64_t pooling_mode, const Tensor &indice_weights, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_weighted_codegen_cuda.cu']]], + ['dense_5fembedding_5fcodegen_5fforward_5fweighted_5fmeta_8',['dense_embedding_codegen_forward_weighted_meta',['../gen__embedding__forward__dense__weighted__codegen__meta_8cpp.html#ac89d0c2dc36fc6053f0425a919711b3a',1,'gen_embedding_forward_dense_weighted_codegen_meta.cpp']]], + ['dense_5fembedding_5fcodegen_5fgrad_5findice_5fweights_5fcuda_9',['dense_embedding_codegen_grad_indice_weights_cuda',['../gen__embedding__backward__dense__indice__weights__codegen__cuda_8cu.html#aa413d80f0ebbadd4375b29cfb27654b3',1,'dense_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &feature_requires_grad): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aa413d80f0ebbadd4375b29cfb27654b3',1,'dense_embedding_codegen_grad_indice_weights_cuda(const Tensor &grad_output, const Tensor &dev_weights, const Tensor &weights_offsets, const Tensor &D_offsets, const int64_t max_D, const Tensor &indices, const Tensor &offsets, const Tensor &feature_requires_grad): gen_embedding_backward_dense_indice_weights_codegen_cuda.cu']]], + 
['dense_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_10',['dense_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__forward__dense__unweighted__codegen__cuda_8cu.html#aadd3974603c08fba6a7c21638a57e7f4',1,'dense_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu'],['../embedding__backward__dense__host_8cpp.html#aadd3974603c08fba6a7c21638a57e7f4',1,'dense_embedding_nobag_codegen_forward_unweighted_cuda(const Tensor &dev_weights, const Tensor &weights_offsets, const int64_t D, const Tensor &indices, const Tensor &offsets, const int64_t output_dtype, const bool is_experimental): gen_embedding_forward_dense_unweighted_codegen_cuda.cu']]], + ['dense_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fmeta_11',['dense_embedding_nobag_codegen_forward_unweighted_meta',['../gen__embedding__forward__dense__unweighted__codegen__meta_8cpp.html#ac9b06d5bef944e3f22c1b7d5faf0cc73',1,'gen_embedding_forward_dense_unweighted_codegen_meta.cpp']]], + ['dense_5fto_5fjagged_12',['dense_to_jagged',['../group__jagged-tensor-ops-cpu.html#gae25fa8a028fc083f06e445e1d2ebb208',1,'fbgemm_gpu']]], + ['dense_5fto_5fjagged_5fforward_13',['dense_to_jagged_forward',['../namespacefbgemm__gpu.html#aa5a76157eb45b9bd4159a548e8a73ce6',1,'fbgemm_gpu']]], + ['dequantize_5fload_14',['dequantize_load',['../namespacefbgemm__gpu.html#aee340827dbc6c104a400c30f47f3ee3b',1,'fbgemm_gpu::dequantize_load(const src_t *value, const float2)'],['../namespacefbgemm__gpu.html#a74358134402be54c82696697fe766b9a',1,'fbgemm_gpu::dequantize_load(const uint8_t *value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aaed854f05a4542637ac342bfab57bdc7',1,'fbgemm_gpu::dequantize_load(const uint8_t *value, const float2 qparams)']]], + 
['dequantize_5fpacked_5fhfp8_15',['dequantize_packed_hfp8',['../namespacefbgemm__gpu.html#a0c388276a962d14b3070dc55202eaf66',1,'fbgemm_gpu']]], + ['dequantize_5fpermuted_5fint2_16',['dequantize_permuted_int2',['../namespacefbgemm__gpu.html#a96be7f5b4c81d93bf024348e7b85e364',1,'fbgemm_gpu']]], + ['dequantize_5fpermuted_5fint4_17',['dequantize_permuted_int4',['../namespacefbgemm__gpu.html#a2cf47d59251a0840fd370a95fa371681',1,'fbgemm_gpu']]], + ['dequantize_5fpermuted_5fint8_18',['dequantize_permuted_int8',['../namespacefbgemm__gpu.html#adec3504b0909c4380da3c0aac89055de',1,'fbgemm_gpu']]], + ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcpu_19',['direct_mapped_lru_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#ac827cf6cd0f063a6747deaff14e4902d',1,'fbgemm_gpu']]], + ['direct_5fmapped_5flru_5fcache_5fpopulate_5fbyte_5fcuda_20',['direct_mapped_lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#gae019b6879bd9f89a146e0700d5a4bd8b',1,'direct_mapped_lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate__byte_8cu.html#ab944b6f7e1df36b8ef0c4a911c1b0afb',1,'direct_mapped_lru_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, int64_t time_stamp, Tensor lru_state, Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, 
c10::optional< Tensor > uvm_cache_stats): lru_cache_populate_byte.cu']]], + ['direct_5fmapped_5flxu_5fcache_5flookup_5fcpu_21',['direct_mapped_lxu_cache_lookup_cpu',['../namespacefbgemm__gpu.html#a03949dd527b81758e43a4b48800c3bc6',1,'fbgemm_gpu']]], + ['direct_5fmapped_5flxu_5fcache_5flookup_5fcuda_22',['direct_mapped_lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#gab305ebdd3822794c5ac462bf5df4bb49',1,'direct_mapped_lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#a9a01f6df03e867e1871df306a6289e06',1,'direct_mapped_lxu_cache_lookup_cuda(Tensor linear_cache_indices, Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats): lxu_cache.cu']]], + ['div_23',['div',['../structfbgemm__gpu_1_1_vec4_acc_t.html#a36a62a848632d6968fe6723ee19277da',1,'fbgemm_gpu::Vec4AccT']]], + ['div_24',['Div',['../classfbgemm__gpu_1_1_fixed_divisor.html#a74e5cb4569d6d48cbc0ee32674a7e374',1,'fbgemm_gpu::FixedDivisor']]], + ['div_5fround_5fup_25',['div_round_up',['../namespacenbit.html#a620ba1c7dba3e279e09759758b7a86db',1,'nbit::div_round_up()'],['../namespacefbgemm__gpu.html#a1e5f0f7703057bbda166a7723b16e6ef',1,'fbgemm_gpu::div_round_up()']]], + ['divmod_26',['DivMod',['../classfbgemm__gpu_1_1_fixed_divisor.html#abea2bdfe3649f1b944a15453e78ae523',1,'fbgemm_gpu::FixedDivisor::DivMod()'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a31faa05b32d14aec34e66800b6092329',1,'DivMod(global_warp_id, &t, &table_warp_id): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a31faa05b32d14aec34e66800b6092329',1,'DivMod(global_warp_id, &t, &table_warp_id): 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../namespacefbgemm__gpu.html#aef6bada16cf81832eb1e594eb47875d8',1,'fbgemm_gpu::DivMod(global_warp_id, reinterpret_cast< int32_t * >(&list_id), reinterpret_cast< int32_t * >(&warp_id))']]], + ['dummy_5fpacked_5faccessor32_27',['dummy_packed_accessor32',['../namespacefbgemm__gpu.html#a86a8cc18b54f6986ec4faeec0b223907',1,'fbgemm_gpu']]], + ['dummy_5fpacked_5faccessor64_28',['dummy_packed_accessor64',['../namespacefbgemm__gpu.html#aeb6f64d8ceb0189b03aa6808b97e8b16',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_5.js b/search/functions_5.js index 1db2fa6fd..393e65828 100644 --- a/search/functions_5.js +++ b/search/functions_5.js @@ -1,5 +1,16 @@ var searchData= [ - ['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_0',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function',['../group__embedding-cuda.html#gabbe880100f1036a979f3a8d8755447d0',1,'embedding_forward_quantized_host.cpp']]], - ['is_5fuvm_5ftensor_1',['is_uvm_tensor',['../group__cumem-utils.html#ga0b9f28b07d3796a732b1fb73b8e10e7e',1,'fbgemm_gpu']]] + ['element_5fwise_5fmul_5f_0',['element_wise_mul_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< float >::element_wise_mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< at::Half >::element_wise_mul_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ae653589cf39f92811f8509363515532d',1,'fbgemm_gpu::Vec4T< at::Half >::element_wise_mul_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a8c36671f882604ae41f214e978ebf04b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::element_wise_mul_(const Vec4T< float > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ae653589cf39f92811f8509363515532d',1,'fbgemm_gpu::Vec4T< at::BFloat16 
>::element_wise_mul_(const Vec4T< at::Half > &a)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a077873e0dd3516731c2302c7b3dee475',1,'fbgemm_gpu::Vec4T< double >::element_wise_mul_()']]], + ['embedding_5fbag_5frowwise_5fprune_1',['embedding_bag_rowwise_prune',['../namespacefbgemm__gpu.html#ae586c9948dba8a67abf44ada58425fba',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fcpu_2',['embedding_inplace_update_cpu',['../namespacefbgemm__gpu.html#aaa1807fa25793e61743b75d27db063cc',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fcpu_5fkernel_3',['embedding_inplace_update_cpu_kernel',['../namespacefbgemm__gpu.html#af3e9e1ce0f6340f233ef6ae8934454cf',1,'fbgemm_gpu']]], + ['embedding_5finplace_5fupdate_5fcuda_4',['embedding_inplace_update_cuda',['../namespacefbgemm__gpu.html#a54bf7e9b54b5263cf039100cda517c34',1,'fbgemm_gpu']]], + ['embeddingrocksdb_5',['EmbeddingRocksDB',['../classssd_1_1_embedding_rocks_d_b.html#a703b26ce10b84fa35ea496114f1ebbb5',1,'ssd::EmbeddingRocksDB']]], + ['emulate_5fcache_5fmiss_6',['emulate_cache_miss',['../split__embeddings__cache__cuda_8cuh.html#a8f112d04838c2019df06ffbb84dbafba',1,'emulate_cache_miss(at::Tensor lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, at::Tensor uvm_cache_stats): lru_cache_find.cu'],['../lru__cache__find_8cu.html#a8a80ce6ea3d62b9f22ac391767b34538',1,'emulate_cache_miss(Tensor lxu_cache_locations, const int64_t enforced_misses_per_256, const bool gather_cache_stats, Tensor uvm_cache_stats): lru_cache_find.cu']]], + ['enum_5fquery_7',['enum_query',['../classfbgemm__gpu_1_1enum__registration.html#a84cad106fb24ea59687f6708d197cc64',1,'fbgemm_gpu::enum_registration']]], + ['enum_5fregistration_8',['enum_registration',['../classfbgemm__gpu_1_1enum__registration.html#afa13a8542c6dde450214a387cacf3a9b',1,'fbgemm_gpu::enum_registration']]], + ['evict_9',['evict',['../structfbgemm__gpu_1_1_weight_row.html#a64c9f91fe6b60f7294ce6bb363bdb234',1,'fbgemm_gpu::WeightRow']]], + 
['exclusive_5fscan_5fptrs_5fcpu_10',['exclusive_scan_ptrs_cpu',['../namespacefbgemm__gpu.html#aa8eb0fcd765dc4580084f6d098604e0d',1,'fbgemm_gpu']]], + ['expand_5finto_5fjagged_5fpermute_5fcpu_11',['expand_into_jagged_permute_cpu',['../namespacefbgemm__gpu.html#a02fab30a12d9d6ee6e6ae68bc8041481',1,'fbgemm_gpu']]], + ['expand_5finto_5fjagged_5fpermute_5fcuda_12',['expand_into_jagged_permute_cuda',['../group__sparse-data-cuda.html#ga2402de1c0102b21af5f2bd5a50d30309',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_6.js b/search/functions_6.js index 279f7e7e4..e21182499 100644 --- a/search/functions_6.js +++ b/search/functions_6.js @@ -1,6 +1,48 @@ var searchData= [ - ['jagged_5fdense_5felementwise_5fadd_0',['jagged_dense_elementwise_add',['../group__jagged-tensor-ops-cpu.html#gaa797caaa08c70857433ae987d9cf30d7',1,'fbgemm_gpu']]], - ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_1',['jagged_dense_elementwise_add_jagged_output',['../group__jagged-tensor-ops-cpu.html#ga1290f40c3ba39837dd009c3006353d7c',1,'fbgemm_gpu']]], - ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fcuda_2',['jagged_dense_elementwise_add_jagged_output_cuda',['../group__jagged-tensor-ops-cuda.html#gad34ac20d2c9be5a6489c8e8befff7938',1,'fbgemm_gpu']]] + ['false_0',['false',['../gen__embedding__backward__split__grad_8cu.html#a05118d1db073d73fe80ee01b40791cf6',1,'false(): gen_embedding_backward_split_grad.cu'],['../namespacenbit.html#af9110ca4f61dbcc64cf0f8118cdc97f1',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a249c23ff8c01f39126136bc2539952fe',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a508c0bc5d94dee1c736f755730ca2beb',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a4c70aaadd08c9449d6cedae3e20ea68c',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a90040b4a20a116df4d0c66c160e6e764',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#aab2d7afb4b654ce45cfc2748e78ac253',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations)'],['../namespacenbit.html#a308832faa1970c724a5589233e352f17',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a04aec5313af7eaae824c4738345d4b6a',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, const int exponent_bits, 
const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a741f318d94db0cb3578afea1e4630cc9',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a7fd32cfedb1f12bb236748026afb62f0',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, 
pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6798d1239a1e727f202aa623317a936c',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae6208ce34aaecc5de1eea88805352dda',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, 
pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a0b028a0d4eab6f827b0747e791479111',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ab5d4641eabcd497e393236456c66f662',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, 
const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a92aa5aa305b64d0be3324318e749f727',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a1628074b31c14dcc07fd3d859e9ddf89',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t row_alignment, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a6502e80c3fcff2fd9816c54de76346c5',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a62b93a28ed713cca24870802bd016e03',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, 
const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a59ea73f8b7947242291927c972ebf040',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a0525091bf8439436819eef72a5c45ca6',1,'nbit::false(const 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#ae661502dfcff9025fb909b009a194e2f',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int 
exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::Half, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a4ef67d9b7b4ba3292ad30493c9daf596',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< at::BFloat16, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a3f1b79dd7ed41442b0dfb240f2ab0ec7',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, 
at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< float, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations)'],['../namespacenbit.html#a18e19fee6513187e93010f11a932f6de',1,'nbit::false(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< uint8_t, 1, at::RestrictPtrTraits > weights_tys, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > offsets, const int64_t pooling_mode, const int64_t row_alignment, pta::PackedTensorAccessor32< float, 1, at::RestrictPtrTraits > indice_weights, const int exponent_bits, const int exponent_bias, pta::PackedTensorAccessor32< uint8_t, 2, at::RestrictPtrTraits > output, const pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations)'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a311bf35bff79e995c3e6d7d2e6a69952',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8c5c41f01ea1d775126bc194e1e95ecc',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a16040890e0367b0669f51c05b4715ecd',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor 
fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a8f19e545f5c45f11ee4c5898decb994c',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a54d26a841bc71bb0c9fdcb2f657d3058',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aaaa117179cc47a2a2fbdb86da6066081',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aff4f86de443efa79fda96f93b78b26d4',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a0c3c6fbc30353d25b4ada5dba7ed9ad3',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t 
max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a71b5f71e99a903571a45d1bfb5dd6537',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a678e19ebc31d391a2101878805cfec04',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): 
gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a5e71bf5354b291e99138e5b51a2c8987',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a39a233002f8c2aadb3206424d3cf33ed',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afb14ab09e129e59e6e323cc8ad114e0a',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const 
FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ac335cccca06f6bd0865b65bb20192a24',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a6a008e7d608ca15741939511b1f48878',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): 
gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#acb117339908a6826b75877db094f909d',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, uint8_t *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#afbb29ff03c359916c050f25deac56e9e',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1695088ded9f86314e0bc374c4ad57f9',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, 
const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#af26c8601b994cb4ad7a7d08104ccc876',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#ab84745c1fc3e4c483778cc8dc325eb7f',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const 
int32_t *__restrict__ const lxu_cache_locations, at::Half *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a0b7156fcc5a6e05dd2ab1a0dd33f339d',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const float *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a5acddab9f4eec4c91ba1403005c3ec7d',1,'false(const uint8_t *__restrict__ const dev_weights, const uint8_t *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a79860db3c0c6c510a821d9ac0a4c6764',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const float *__restrict__ 
const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7ccf30944601039563603d837470824c',1,'false(const float *__restrict__ const dev_weights, const float *__restrict__ const uvm_weights, const at::Half *__restrict__ const lxu_cache_weights, const int32_t *__restrict__ const weights_placements, const uint32_t B, const uint32_t T, const bool mean_pooling, const uint32_t max_D_cache, const FixedDivisor fd_num_warps_per_table, const int64_t *__restrict__ const indices, const float *__restrict__ const index_weights, const int64_t *__restrict__ const offsets, const uint32_t *__restrict__ const D_offsets, const int64_t *__restrict__ const weights_offsets, const int32_t *__restrict__ const lxu_cache_locations, float *__restrict__ const output): gen_embedding_forward_split_weighted_v2_kernel.cu']]], + ['fbgemm_5fgpu_5fenum_5fregister_5fstart_1',['FBGEMM_GPU_ENUM_REGISTER_START',['../namespacefbgemm__gpu.html#a0e41e402bfba1e346c6dcc610252e94b',1,'fbgemm_gpu']]], + ['fbgemm_5fop_5fdispatch_2',['FBGEMM_OP_DISPATCH',['../batched__dense__vec__jagged__2d__mul__backward_8cu.html#a505e960fb46aaed90cbf00060c4f7f73',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul_backward", fbgemm_gpu::batched_dense_vec_jagged_2d_mul_backward): 
batched_dense_vec_jagged_2d_mul_backward.cu'],['../batched__dense__vec__jagged__2d__mul__forward_8cu.html#ae6d9314c75be8852a64432f06a618a51',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul_forward", fbgemm_gpu::batched_dense_vec_jagged_2d_mul_forward): batched_dense_vec_jagged_2d_mul_forward.cu'],['../dense__to__jagged__forward_8cu.html#a2f09e89f2172cc358cfffdc866220276',1,'FBGEMM_OP_DISPATCH(CUDA, "dense_to_jagged_forward", fbgemm_gpu::dense_to_jagged_forward): dense_to_jagged_forward.cu'],['../jagged__dense__bmm__forward_8cu.html#a10db24b3c6258b287f12eb591b6b1274',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_bmm_forward", fbgemm_gpu::jagged_dense_bmm_forward_cuda): jagged_dense_bmm_forward.cu'],['../jagged__dense__dense__elementwise__add__jagged__output__forward_8cu.html#a4dc38a80ec480c8ba5e73920df40ade3',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_dense_elementwise_add_jagged_output_forward", fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output_forward): jagged_dense_dense_elementwise_add_jagged_output_forward.cu'],['../jagged__dense__elementwise__mul__backward_8cu.html#a56064ede1846b15cd7ee664d3ac0f447',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul_backward", fbgemm_gpu::jagged_dense_elementwise_mul_backward): jagged_dense_elementwise_mul_backward.cu'],['../jagged__dense__elementwise__mul__forward_8cu.html#a55ae1a4e6489decd594fc7c77fb36cd4',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul_forward", fbgemm_gpu::jagged_dense_elementwise_mul_forward): jagged_dense_elementwise_mul_forward.cu'],['../jagged__index__add__2d__forward_8cu.html#a6fbf3dbceb513f8dfa17d68303b4e1f1',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_index_add_2d_forward", fbgemm_gpu::jagged_index_add_2d_forward_cuda): jagged_index_add_2d_forward.cu'],['../jagged__index__select__2d__forward_8cu.html#a769ab9425e6b9229e5197a606072f7f7',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_index_select_2d_forward", fbgemm_gpu::jagged_index_select_2d_forward_cuda): 
jagged_index_select_2d_forward.cu'],['../jagged__jagged__bmm__forward_8cu.html#ad970c4b273bd75194ccced952b277f40',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_jagged_bmm_forward", fbgemm_gpu::jagged_jagged_bmm_forward_cuda): jagged_jagged_bmm_forward.cu'],['../jagged__softmax__backward_8cu.html#af86af3150ade27ed65bffd51e7fd389a',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax_backward", fbgemm_gpu::jagged_softmax_backward_cuda): jagged_softmax_backward.cu'],['../jagged__softmax__forward_8cu.html#ad64b64d7d37e8e47389d74bbb5b9287f',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax_forward", fbgemm_gpu::jagged_softmax_forward_cuda): jagged_softmax_forward.cu'],['../jagged__tensor__ops_8cu.html#ae9145e7dc8cdcfab08478c78e11806ee',1,'FBGEMM_OP_DISPATCH(CUDA, "dense_to_jagged", fbgemm_gpu::dense_to_jagged): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#af36ae71857641f82f406e9d03287e165',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense", fbgemm_gpu::jagged_to_padded_dense): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a63e1ce09a4f40dd4f79b7ceb985b2faf',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_add", fbgemm_gpu::jagged_dense_elementwise_add): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a4f366150fd0ce1400047ea614232e9f8',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_dense_elementwise_add_jagged_output", fbgemm_gpu::jagged_dense_dense_elementwise_add_jagged_output): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#afd6b82766bc27ff6c2e957e57ec2947e',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_mul", fbgemm_gpu::jagged_dense_elementwise_mul): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a83e06ed43d316e587c86bd1b83a233a8',1,'FBGEMM_OP_DISPATCH(CUDA, "batched_dense_vec_jagged_2d_mul", fbgemm_gpu::batched_dense_vec_jagged_2d_mul): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a5a65d954fda4f3313d036b22b3232872',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_1d_to_dense", fbgemm_gpu::jagged_1d_to_dense): 
jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a84c5e68f36966340db42aa25785290df',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_2d_to_dense", fbgemm_gpu::jagged_2d_to_dense): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a6b3f90be325532b25c5df0c87c15e083',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_softmax", fbgemm_gpu::jagged_softmax): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a56fea1ad733f259a42c89661e1bf2637',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_jagged_bmm", fbgemm_gpu::jagged_jagged_bmm): jagged_tensor_ops.cu'],['../jagged__tensor__ops_8cu.html#a48e6bd6975582a7ce4ceff6712fa6ef9',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_bmm", fbgemm_gpu::jagged_dense_bmm): jagged_tensor_ops.cu'],['../jagged__to__padded__dense__backward_8cu.html#a0ec346f5fe59608b8e13809432c9a389',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense_backward", fbgemm_gpu::jagged_to_padded_dense_backward): jagged_to_padded_dense_backward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a1526839450b4cbf68a2d6a70673e273a',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_to_padded_dense_forward", fbgemm_gpu::jagged_to_padded_dense_forward): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a9797a098549c8193d6beb70cb5d7da4f',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_1d_to_dense", fbgemm_gpu::stacked_jagged_1d_to_dense_gpu): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a84d4e43e8339a03b14fe872dd3b2d50a',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_2d_to_dense", fbgemm_gpu::stacked_jagged_2d_to_dense_gpu): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a61110a1a4f03edaa3322b245624b294e',1,'FBGEMM_OP_DISPATCH(CUDA, "stacked_jagged_2d_to_dense_forward", fbgemm_gpu::stacked_jagged_2d_to_dense_forward_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a1a53264bb9ade4d2796b87a966ab450c',1,'FBGEMM_OP_DISPATCH(CUDA, 
"stacked_jagged_2d_to_dense_backward", fbgemm_gpu::stacked_jagged_2d_to_dense_backward_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__to__padded__dense__forward_8cu.html#a65d732670fec1bee849caf445b2903e7',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_dense_elementwise_add_jagged_output", fbgemm_gpu::jagged_dense_elementwise_add_jagged_output_cuda): jagged_to_padded_dense_forward.cu'],['../jagged__unique__indices_8cu.html#a674314745cbd8dd913142d0660083851',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_unique_indices", fbgemm_gpu::jagged_unique_indices_cuda): jagged_unique_indices.cu'],['../jagged__unique__indices_8cu.html#aaf228a3ce26c3ae9c749573883b59be5',1,'FBGEMM_OP_DISPATCH(CUDA, "jagged_hash_size_cumsum", fbgemm_gpu::jagged_hash_size_cumsum_cuda): jagged_unique_indices.cu'],['../keyed__jagged__index__select__dim1_8cu.html#a69db0b3f600c7c45db29069cd05d3bea',1,'FBGEMM_OP_DISPATCH(CUDA, "keyed_jagged_index_select_dim1", fbgemm_gpu::keyed_jagged_index_select_dim_1_gpu): keyed_jagged_index_select_dim1.cu'],['../merge__pooled__embedding__ops__cpu_8cpp.html#a1ec90ab98c9d6c18099549dce392fd65',1,'FBGEMM_OP_DISPATCH(CPU, "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings_cpu): merge_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a37755fb9333b1017d34b49ee0247004e',1,'FBGEMM_OP_DISPATCH(CPU, "permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a83bf468fc58e605fc64461726caad8cf',1,'FBGEMM_OP_DISPATCH(CPU, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a765ed01147edbd93b01e5f91fe12f68b',1,'FBGEMM_OP_DISPATCH(CPU, "permute_duplicate_pooled_embs", fbgemm_gpu::permute_duplicate_pooled_embs_cpu): 
permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#aa0ac9a165fb46ae5738c08e0a887a97b',1,'FBGEMM_OP_DISPATCH(CPU, "permute_duplicate_pooled_embs_auto_grad", fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a941e973d6b74e10046ae3373ba10bda2',1,'FBGEMM_OP_DISPATCH(Meta, "permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_meta): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a7590e07b38befcd57df567cb054cfad3',1,'FBGEMM_OP_DISPATCH(Meta, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad_meta): permute_pooled_embedding_ops_cpu.cpp'],['../permute__pooled__embedding__ops__cpu_8cpp.html#a858ecafbed2f155f42fe99391b82e4b4',1,'FBGEMM_OP_DISPATCH(Autograd, "permute_pooled_embs_auto_grad", fbgemm_gpu::permute_pooled_embs_auto_grad): permute_pooled_embedding_ops_cpu.cpp'],['../quantize__bfloat16_8cu.html#a44eca6a446116eaa006c5bd0488d62f2',1,'FBGEMM_OP_DISPATCH(CUDA, "Bfloat16QuantizedToFloat", fbgemm_gpu::_bfloat16_to_float_gpu): quantize_bfloat16.cu'],['../quantize__bfloat16_8cu.html#a4ed2eb1cae3301906c55dc98ee5ce687',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToBfloat16Quantized", fbgemm_gpu::_float_to_bfloat16_gpu): quantize_bfloat16.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a360b78a6e199bcda032c8896708398db',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFused8BitRowwiseQuantized", fbgemm_gpu::_float_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#afed513cf23a1957fa7f44309ed54288e',1,'FBGEMM_OP_DISPATCH(CUDA, "HalfToFused8BitRowwiseQuantized", fbgemm_gpu::_half_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#af35eb9fa075d341e379886496b6f2dad',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatOrHalfToFused8BitRowwiseQuantized", 
fbgemm_gpu::_single_or_half_precision_to_fused8bitrowwise_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#ac2c5ae3ba26c4c71b5e42651752f6e05',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloat", fbgemm_gpu::_fused8bitrowwise_to_float_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a5ed3f01bedfeee57b88e3343ebab204a',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToHalf", fbgemm_gpu::_fused8bitrowwise_to_half_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#ac5c42d23d15559e0fab4a67b274ac722',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloatOrHalf", fbgemm_gpu::_fused8bitrowwise_to_single_or_half_precision_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__8bit__rowwise_8cu.html#a36f61e129797f0efa0fa02acd3bf1628',1,'FBGEMM_OP_DISPATCH(CUDA, "Fused8BitRowwiseQuantizedToFloatMixedDim", fbgemm_gpu::_fused8bitrowwise_to_float_mixed_dim_gpu): quantize_fused_8bit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#ac0d21a1093187621384e9f7ee12af6f5',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_float_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#acc803cc30f01a51dcba4d3e89471a836',1,'FBGEMM_OP_DISPATCH(CUDA, "HalfToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_half_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#a9235db627f7b35c43f5a8baee9c6e73f',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatOrHalfToFusedNBitRowwiseQuantizedSBHalf", fbgemm_gpu::_float_or_half_to_fusednbitrowwise_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#a04df767b706b47ca163b528c0ec49659',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToFloat", fbgemm_gpu::_fusednbitrowwise_to_float_gpu): 
quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#ae8e33c20c4bfee06ceac1b42b87d40e0',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToHalf", fbgemm_gpu::_fusednbitrowwise_to_half_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__fused__nbit__rowwise_8cu.html#af782044b726c577b026de55ab1e37681',1,'FBGEMM_OP_DISPATCH(CUDA, "FusedNBitRowwiseQuantizedSBHalfToFloatOrHalf", fbgemm_gpu::_fusednbitrowwise_to_float_or_half_gpu): quantize_fused_nbit_rowwise.cu'],['../quantize__hfp8_8cu.html#a38d08a293e27467dfdda3bb72ea27596',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToHFP8Quantized", fbgemm_gpu::_float_to_hfp8_gpu): quantize_hfp8.cu'],['../quantize__hfp8_8cu.html#a137d7c9cbf1612b410dd45b3bbebbea0',1,'FBGEMM_OP_DISPATCH(CUDA, "HFP8QuantizedToFloat", fbgemm_gpu::_hfp8_to_float_gpu): quantize_hfp8.cu'],['../quantize__msfp_8cu.html#abba68956be833439bf5ecabfe3880300',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToMSFPQuantized", fbgemm_gpu::_float_to_msfp_gpu): quantize_msfp.cu'],['../quantize__msfp_8cu.html#ace6d6f85efbdd32b7378b07a2e394166',1,'FBGEMM_OP_DISPATCH(CUDA, "MSFPQuantizedToFloat", fbgemm_gpu::_msfp_to_float_gpu): quantize_msfp.cu'],['../quantize__ops__gpu_8cpp.html#a0d298145c58d3db95b0838ab9e321626',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToFP8RowwiseQuantized", fbgemm_gpu::_float_to_FP8rowwise_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a0ae0af8cb484307360d889119a25a870',1,'FBGEMM_OP_DISPATCH(CUDA, "FP8RowwiseQuantizedToFloat", fbgemm_gpu::_FP8rowwise_to_float_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a6f70026edd736cca0ec96d6369571e06',1,'FBGEMM_OP_DISPATCH(CUDA, "FloatToPaddedFP8RowwiseQuantized", fbgemm_gpu::_float_to_paddedFP8rowwise_gpu): quantize_ops_gpu.cpp'],['../quantize__ops__gpu_8cpp.html#a18e52d6b9f96ae0c9f7552f54808d958',1,'FBGEMM_OP_DISPATCH(CUDA, "PaddedFP8RowwiseQuantizedToFloat", fbgemm_gpu::_paddedFP8rowwise_to_float_gpu): 
quantize_ops_gpu.cpp'],['../sparse__async__cumsum_8cu.html#a37ee97bf0cf5f3e51b626963d0905d5d',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_exclusive_cumsum", fbgemm_gpu::asynchronous_exclusive_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__async__cumsum_8cu.html#a956fe5a496592a618c66c5cdd7e76aee',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_complete_cumsum", fbgemm_gpu::asynchronous_complete_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__async__cumsum_8cu.html#a1fe1796f45f950ba568e1f5fb38d3da8',1,'FBGEMM_OP_DISPATCH(CUDA, "asynchronous_inclusive_cumsum", fbgemm_gpu::asynchronous_inclusive_cumsum_gpu): sparse_async_cumsum.cu'],['../sparse__block__bucketize__features_8cu.html#ac393348a81fe14a2734e4a221b3e028c',1,'FBGEMM_OP_DISPATCH(CUDA, "block_bucketize_sparse_features", fbgemm_gpu::block_bucketize_sparse_features_cuda): sparse_block_bucketize_features.cu'],['../sparse__bucketize__features_8cu.html#a9f5c60b5d418eded60f0c447ae38c450',1,'FBGEMM_OP_DISPATCH(CUDA, "bucketize_sparse_features", fbgemm_gpu::bucketize_sparse_features_cuda): sparse_bucketize_features.cu'],['../sparse__expand__into__jagged__permute_8cu.html#af4f7b3da9350e95957c452753c2569a7',1,'FBGEMM_OP_DISPATCH(CUDA, "expand_into_jagged_permute", fbgemm_gpu::expand_into_jagged_permute_cuda): sparse_expand_into_jagged_permute.cu'],['../sparse__invert__permute_8cu.html#a472cc598c3ed7832c2866f8aaed5fdc8',1,'FBGEMM_OP_DISPATCH(CUDA, "invert_permute", fbgemm_gpu::invert_permute_cuda): sparse_invert_permute.cu'],['../sparse__permute102_8cu.html#aa5a7770ccd8e2e72012a3035579d2cfc',1,'FBGEMM_OP_DISPATCH(CUDA, "permute102_baddbmm_permute102", fbgemm_gpu::permute102_baddbmm_permute102_cuda): sparse_permute102.cu'],['../sparse__permute__1d_8cu.html#aa28c2751b385fa3416aa12a3dd2cb039',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_1D_sparse_data", fbgemm_gpu::permute_1D_sparse_data_cuda): sparse_permute_1d.cu'],['../sparse__permute__2d_8cu.html#ab884888820b4be2c942de1bf75211b2b',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sparse_data", 
fbgemm_gpu::permute_2D_sparse_data_cuda): sparse_permute_2d.cu'],['../sparse__permute__2d_8cu.html#aab7fc0ba2b46743531f3d2fe4392be84',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_2D_sparse_data", fbgemm_gpu::permute_2D_sparse_data_cuda): sparse_permute_2d.cu'],['../sparse__permute__2d_8cu.html#a16728339b915be3a73e7bced8598849f',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sparse_features", fbgemm_gpu::permute_sparse_features_cuda): sparse_permute_2d.cu'],['../sparse__permute__embeddings_8cu.html#a2281b30913187261c5233174f3f9622c',1,'FBGEMM_OP_DISPATCH(CUDA, "permute_sequence_embeddings", fbgemm_gpu::permute_sequence_embeddings_cuda): sparse_permute_embeddings.cu'],['../sparse__range_8cu.html#a85fc3de0cb5d8acd0c760b984ff30f3b',1,'FBGEMM_OP_DISPATCH(CUDA, "offsets_range", fbgemm_gpu::offsets_range_cuda): sparse_range.cu'],['../sparse__range_8cu.html#a7a62f9a9f0e7b39a3331e3cee8be776e',1,'FBGEMM_OP_DISPATCH(CUDA, "lengths_range", fbgemm_gpu::lengths_range_cuda): sparse_range.cu'],['../sparse__segment__sum__csr_8cu.html#ae64cf20351791f453c8f3156ed01c224',1,'FBGEMM_OP_DISPATCH(CUDA, "segment_sum_csr", fbgemm_gpu::segment_sum_csr_cuda): sparse_segment_sum_csr.cu']]], + ['fixeddivisor_3',['FixedDivisor',['../classfbgemm__gpu_1_1_fixed_divisor.html#a80d1fd876167b0bbb2d6a7ebdaa97270',1,'fbgemm_gpu::FixedDivisor']]], + ['float16_5fmax_4',['float16_max',['../namespacefbgemm__gpu.html#acb046dd929c4c4190894087e0952b6ad',1,'fbgemm_gpu']]], + ['float16_5fmin_5',['float16_min',['../namespacefbgemm__gpu.html#aab696723995ed599860851113bfdae05',1,'fbgemm_gpu']]], + ['float1_5fmax_6',['float1_max',['../namespacefbgemm__gpu.html#a245cd4874d44db0533c14f1e5da13b0d',1,'fbgemm_gpu']]], + ['float1_5fmin_7',['float1_min',['../namespacefbgemm__gpu.html#a3ec9af370f9f9997a31175d653701b82',1,'fbgemm_gpu']]], + ['float2_5fmax_8',['float2_max',['../namespacefbgemm__gpu.html#a75186b0bdaba58d01566eec48d2f6602',1,'fbgemm_gpu']]], + 
['float2_5fmin_9',['float2_min',['../namespacefbgemm__gpu.html#aa0397156c968ae38da1e433bfd50d3a3',1,'fbgemm_gpu']]], + ['float4_5fmax_10',['float4_max',['../namespacefbgemm__gpu.html#a7aaeb2b2ad68d85c51fb2b8697c70cc4',1,'fbgemm_gpu']]], + ['float4_5fmin_11',['float4_min',['../namespacefbgemm__gpu.html#adf07e886eabd113338425ed288c06a7b',1,'fbgemm_gpu']]], + ['float8_5fmax_12',['float8_max',['../namespacefbgemm__gpu.html#aa292f064d1126228ac0d10457722616c',1,'fbgemm_gpu']]], + ['float8_5fmin_13',['float8_min',['../namespacefbgemm__gpu.html#abca50cf5035e82d7992586eac7b744cf',1,'fbgemm_gpu']]], + ['float_5for_5fhalf_5fto_5ffused8bitrowwise_5fcpu_14',['float_or_half_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#ga06b7d2bf3fadaa9869555a64a6752ef7',1,'fbgemm_gpu']]], + ['float_5for_5fhalf_5fto_5ffusednbitrowwise_5fcpu_15',['float_or_half_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#ae983a889f16302029fcc4e5fcd5ce34f',1,'fbgemm_gpu']]], + ['float_5fto_5ffp8rowwise_5fcpu_16',['float_to_FP8rowwise_cpu',['../group__quantize-data-cpu.html#gad540dd7f8ad7601b3d9591114e4ef718',1,'fbgemm_gpu']]], + ['float_5fto_5ffused8bitrowwise_5fcpu_17',['float_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#gacf598456fd7aced63b96e8a725f4c418',1,'fbgemm_gpu']]], + ['float_5fto_5ffusednbitrowwise_5fcpu_18',['float_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a9330d767d66b257d1ffa28c67775b38e',1,'fbgemm_gpu']]], + ['float_5fto_5fhfp8_19',['float_to_hfp8',['../namespacefbgemm__gpu.html#a9710845f2dffae8b40b17d49c169976b',1,'fbgemm_gpu']]], + ['float_5fto_5fsto_5fhalf_5fassemblefloat_20',['float_to_sto_half_assemblefloat',['../verify__fp16__stochastic__benchmark_8cu.html#afb0f683c8db4e3b5d5fd504735c60b25',1,'verify_fp16_stochastic_benchmark.cu']]], + ['float_5fto_5fsto_5fhalf_5fbitcarry_21',['float_to_sto_half_bitcarry',['../verify__fp16__stochastic__benchmark_8cu.html#a0fa16f5c4aa1d84c03f25daeb10e9422',1,'verify_fp16_stochastic_benchmark.cu']]], + 
['float_5fto_5fsto_5fhalf_5fdirect_22',['float_to_sto_half_direct',['../verify__fp16__stochastic__benchmark_8cu.html#af0a4d95d246fb468f1b26eace73794f3',1,'verify_fp16_stochastic_benchmark.cu']]], + ['float_5fto_5fsto_5fhalf_5fshortrand_23',['float_to_sto_half_shortrand',['../verify__fp16__stochastic__benchmark_8cu.html#aecab575916373f334a644238b6e02cf2',1,'verify_fp16_stochastic_benchmark.cu']]], + ['floattobfloat16quantized_5fref_24',['FloatToBFloat16Quantized_ref',['../namespacefbgemm__gpu.html#a46f430eb3d28bcd3fed6fbc61dec3bda',1,'fbgemm_gpu']]], + ['floattofp8quantized_5fref_25',['FloatToFP8Quantized_ref',['../group__quantize-data-cpu.html#gad14f49d191c7960681206b7103d781c4',1,'fbgemm_gpu']]], + ['floattofp8rowwisequantized_5fmeta_26',['FloatToFP8RowwiseQuantized_meta',['../namespacefbgemm__gpu.html#a5a525ef518134e136f23ab964d45dc23',1,'fbgemm_gpu']]], + ['flush_27',['flush',['../classssd_1_1_embedding_rocks_d_b.html#adac116554b543b7c4228c018a85882f5',1,'ssd::EmbeddingRocksDB']]], + ['flush_5fcache_28',['flush_cache',['../verify__fp16__stochastic__benchmark_8cu.html#a65d8faf79602cb52dbf1c3dc90db0cbd',1,'flush_cache(std::vector< char > flush, char *d_flush, char *d_flush2, int cache_size, bool do_write=false): verify_fp16_stochastic_benchmark.cu'],['../bench__utils_8cuh.html#a7fcbe2b8cc9b7676bb24b328fd41dc3a',1,'flush_cache(int cache_size_mb=40, bool do_write=false): bench_utils.cuh']]], + ['flush_5fgpu_29',['flush_gpu',['../verify__fp16__stochastic__benchmark_8cu.html#ab211bd95de3d67a08c95c5d7f070dfcb',1,'verify_fp16_stochastic_benchmark.cu']]], + ['flush_5fif_5fnecessary_30',['flush_if_necessary',['../classssd_1_1_embedding_rocks_d_b.html#a5e5bb9c575c52445a77bd0c39afc50bb',1,'ssd::EmbeddingRocksDB']]], + ['fma_31',['fma',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#ad5c1e8194ecc27d73fb5477bc6795df8',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP 
>::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#af82504393e0e09a157a40980598f626b',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#ad3b821b9b1862e7970a798dcc105dce8',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a3198b30904d0e23bf46c12eabf628e9e',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::fma()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#a5779758db0a3dea1eb734fb1cbf9670d',1,'fbgemm_gpu::Vec4AccT::fma(const float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ad0817540a257625fecb7890a0ed2533c',1,'fbgemm_gpu::Vec4AccT::fma(const float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#acf03f270b01757bf3c12309e398fc663',1,'fbgemm_gpu::Vec4AccT::fma(const uint8_t *ptr, const float weight)'],['../embedding__forward__split__kernel__v2__template_8cu.html#a2a539cccb1f62bb145cef234b6608c7f',1,'fma(): embedding_forward_split_kernel_v2_template.cu']]], + ['fma_5f_32',['fma_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< float >::fma_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a5914148b281516a23c9786a11d6675ad',1,'fbgemm_gpu::Vec4T< at::Half >::fma_(const Vec4T< at::Half > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< at::Half >::fma_(const Vec4T< float > &a, const float 
b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a5914148b281516a23c9786a11d6675ad',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::fma_(const Vec4T< at::Half > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#af7ca249b197579ed0c1e65179d406b92',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::fma_(const Vec4T< float > &a, const float b)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a935586b35f2e7d90ec234784a8a5d2b8',1,'fbgemm_gpu::Vec4T< double >::fma_()'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#ad1ed20d954c2af00a7af0011bb652f42',1,'fbgemm_gpu::Vec4AccT::fma_(const float *vals, const float weight)'],['../structfbgemm__gpu_1_1_vec4_acc_t.html#abe8fde8cd9a20ff924fd33e7d16eaa42',1,'fbgemm_gpu::Vec4AccT::fma_(const half *vals, const float weight)']]], + ['for_33',['for',['../namespacefbgemm__gpu.html#af2287d510f303567f2d28d743aa716b6',1,'fbgemm_gpu']]], + ['forward_34',['forward',['../classfbgemm__gpu_1_1_permute_pooled_embs_function.html#a278e4d6a68c0e694370831a0d04a2918',1,'fbgemm_gpu::PermutePooledEmbsFunction::forward()'],['../classfbgemm__gpu_1_1_permute_pooled_embs_function_split.html#a83e4292464a5708945ca80a1f2171a27',1,'fbgemm_gpu::PermutePooledEmbsFunctionSplit::forward()']]], + ['fp8quantizedtofloat_5fref_35',['FP8QuantizedToFloat_ref',['../group__quantize-data-cpu.html#ga4c49e527f364bfa224ed34f4fe9f13e7',1,'fbgemm_gpu']]], + ['fp8rowwise_5fto_5ffloat_5fcpu_36',['FP8rowwise_to_float_cpu',['../group__quantize-data-cpu.html#ga1d3b2f7c37e8755516ff8a4c504017e1',1,'fbgemm_gpu']]], + ['fp8rowwise_5fto_5ffloat_5fmeta_37',['FP8rowwise_to_float_meta',['../namespacefbgemm__gpu.html#ae7fdacc8f9e0ec9e1ede8102876ab537',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5ffloat_5fcpu_38',['fused8bitrowwise_to_float_cpu',['../group__quantize-data-cpu.html#gab86a824fed15fab1c318359d069a5180',1,'fbgemm_gpu']]], + 
['fused8bitrowwise_5fto_5ffloat_5for_5fhalf_5fcpu_39',['fused8bitrowwise_to_float_or_half_cpu',['../group__quantize-data-cpu.html#gad219617d0aa308f97fad8dfc6af20213',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5fhalf_5fcpu_40',['fused8bitrowwise_to_half_cpu',['../group__quantize-data-cpu.html#ga9284d774f5d4087da98453e96e64d00a',1,'fbgemm_gpu']]], + ['fused8bitrowwise_5fto_5fhalf_5fcpu_5fout_41',['fused8bitrowwise_to_half_cpu_out',['../namespacefbgemm__gpu.html#a389ed2b83ea0f408fe19fbb46770c610',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5ffloat_5fcpu_42',['fusednbitrowwise_to_float_cpu',['../group__quantize-data-cpu.html#ga61c494baf4e410652ed897534d14aa29',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5ffloat_5for_5fhalf_5fcpu_43',['fusednbitrowwise_to_float_or_half_cpu',['../group__quantize-data-cpu.html#ga5bd66d69876ef2493a6ebb4346c31bb9',1,'fbgemm_gpu']]], + ['fusednbitrowwise_5fto_5fhalf_5fcpu_44',['fusednbitrowwise_to_half_cpu',['../group__quantize-data-cpu.html#ga1c32bf52a02928dbc573b4ac67065788',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_7.js b/search/functions_7.js index cbff7a2c9..09ae2f220 100644 --- a/search/functions_7.js +++ b/search/functions_7.js @@ -1,14 +1,33 @@ var searchData= [ - ['lfu_5fcache_5fpopulate_5fbyte_5fcuda_0',['lfu_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu'],['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, 
at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu']]], - ['lfu_5fcache_5fpopulate_5fcuda_1',['lfu_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu'],['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu']]], - ['linearize_5fcache_5findices_5fcuda_2',['linearize_cache_indices_cuda',['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets): linearize_cache_indices.cu']]], - ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcuda_3',['linearize_cache_indices_from_row_idx_cuda',['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices): 
linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(Tensor cache_hash_size_cumsum, Tensor update_table_indices, Tensor update_row_indices): linearize_cache_indices.cu']]], - ['lru_5fcache_5ffind_5funcached_5fcuda_4',['lru_cache_find_uncached_cuda',['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(Tensor unique_indices, Tensor unique_indices_length, int64_t max_indices, Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, bool gather_cache_stats, Tensor uvm_cache_stats, bool lock_cache_line, Tensor lxu_cache_locking_counter): lru_cache_find.cu'],['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter): lru_cache_find.cu']]], - ['lru_5fcache_5fpopulate_5fbyte_5fcuda_5',['lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga5958e4cecc978d415714a3dd691fbc11',1,'split_embeddings_cache_cuda.cuh']]], - ['lru_5fcache_5fpopulate_5fcuda_6',['lru_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga00d12767ad238d73598bf7dc4d1afa06',1,'split_embeddings_cache_cuda.cuh']]], - ['lxu_5fcache_5fflush_5fcuda_7',['lxu_cache_flush_cuda',['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding): 
lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(Tensor uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor lxu_cache_state, Tensor lxu_cache_weights, bool stochastic_rounding): lxu_cache.cu']]], - ['lxu_5fcache_5flocations_5fupdate_5fcuda_8',['lxu_cache_locations_update_cuda',['../group__table-batched-embed-cuda.html#ga65cba33a439fb1ed50fe2e80dc22b603',1,'split_embeddings_cache_cuda.cuh']]], - ['lxu_5fcache_5flocking_5fcounter_5fdecrement_5fcuda_9',['lxu_cache_locking_counter_decrement_cuda',['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu']]], - ['lxu_5fcache_5flookup_5fcuda_10',['lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#ga124b70b0fede88f508e59111ce6d765f',1,'split_embeddings_cache_cuda.cuh']]] + ['gen_5f8bit_5frandom_0',['gen_8bit_random',['../verify__fp16__stochastic__benchmark_8cu.html#aa292d49c7c13666d79ff4c646b5284f0',1,'verify_fp16_stochastic_benchmark.cu']]], + ['gen_5fdata_1',['gen_data',['../verify__fp16__stochastic__benchmark_8cu.html#ab5c51c16cea74c9decd6a2c957b515d9',1,'verify_fp16_stochastic_benchmark.cu']]], + ['generate_5flxu_5fcache_5flocations_2',['generate_lxu_cache_locations',['../uvm__cache__miss__emulate__test_8cpp.html#ad12ee38ec43f8659ee8ce4f63f3857f4',1,'uvm_cache_miss_emulate_test.cpp']]], + ['generate_5frandom_5ftable_3',['generate_random_table',['../bench__utils_8cuh.html#a0899793cc86846edfa6ccefb7905f55c',1,'bench_utils.cuh']]], + 
['generate_5fvbe_5fmetadata_4',['generate_vbe_metadata',['../split__embeddings__utils_8cuh.html#ae0dcbedd529d5873ad0cac75397cb1f8',1,'generate_vbe_metadata(const at::Tensor &B_offsets, const at::Tensor &B_offsets_rank_per_feature, const at::Tensor &output_offsets_feature_rank, const at::Tensor &D_offsets, const int64_t D, const bool nobag, const int64_t max_B_feature_rank, const int64_t info_B_num_bits, const int64_t total_B): generate_vbe_metadata.cu'],['../generate__vbe__metadata_8cu.html#a9c89bc26edc2d2f4014204d89bd846eb',1,'generate_vbe_metadata(const Tensor &B_offsets, const Tensor &B_offsets_rank_per_feature, const Tensor &output_offsets_feature_rank, const Tensor &D_offsets, const int64_t D, const bool nobag, const int64_t max_B_feature_rank, const int64_t info_B_num_bits, const int64_t total_B): generate_vbe_metadata.cu']]], + ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_5',['generic_histogram_binning_calibration_by_feature_cpu',['../group__sparse-data-cpu.html#gaef2a0a8c27e3b8b2d72be5c95ba7539e',1,'fbgemm_gpu']]], + ['generic_5fhistogram_5fbinning_5fcalibration_5fby_5ffeature_5fcuda_6',['generic_histogram_binning_calibration_by_feature_cuda',['../namespacefbgemm__gpu.html#af9209d9d3ea127b5941dcab75bbfd39c',1,'fbgemm_gpu']]], + ['genericpackedtensoraccessor_7',['GenericPackedTensorAccessor',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#a05cb3acde0a408e40526aad85584b274',1,'fbgemm_gpu::GenericPackedTensorAccessor::GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#aa8ff94c7184e151415673957258747e2',1,'fbgemm_gpu::GenericPackedTensorAccessor::GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const 
func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a05cb3acde0a408e40526aad85584b274',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::GenericPackedTensorAccessor(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#aa8ff94c7184e151415673957258747e2',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::GenericPackedTensorAccessor(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)']]], + ['genericpackedtensoraccessorbase_8',['GenericPackedTensorAccessorBase',['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#ad3b41b3123d1d8bfc0e530b2323dde07',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::GenericPackedTensorAccessorBase(PtrType data, const index_t *const sizes, const index_t *const strides, const char *const ptr_name, const char *const func_name)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_base.html#aab339f541ab3ce6195cabda68f736598',1,'fbgemm_gpu::GenericPackedTensorAccessorBase::GenericPackedTensorAccessorBase(PtrType data, const source_index_t *const sizes, const source_index_t *const strides, const char *const ptr_name, const char *const func_name)']]], + ['get_9',['get',['../classssd_1_1_embedding_rocks_d_b.html#a9a5671e5de645f247452456ffdfa81a9',1,'ssd::EmbeddingRocksDB']]], + ['get_5fcuda_10',['get_cuda',['../classssd_1_1_embedding_rocks_d_b.html#ac8082829ce873543f6388ddbd16362e8',1,'ssd::EmbeddingRocksDB']]], + ['get_5fd_5fbytes_11',['get_D_bytes',['../embedding__inplace__update__test_8cpp.html#a602d9bde988d40aaa1d846c76f8d87c7',1,'embedding_inplace_update_test.cpp']]], + 
['get_5fdevice_5findex_5ffrom_5ftensor_12',['get_device_index_from_tensor',['../sparse__ops__utils_8h.html#a672c3da6666124b2950b2eef43587bc6',1,'get_device_index_from_tensor(const at::Tensor &ten): sparse_ops_utils.h'],['../sparse__ops__utils_8h.html#af97638412af3aea185ac327ebe398542',1,'get_device_index_from_tensor(const c10::optional< at::Tensor > &ten): sparse_ops_utils.h']]], + ['get_5fgroup_5findex_5fselect_5fcols_5fper_5fwarp_13',['get_group_index_select_cols_per_warp',['../namespacefbgemm__gpu.html#a4296f0fdcb9a3dcfdd67549340e8f38c',1,'fbgemm_gpu']]], + ['get_5finfos_5fmetadata_14',['get_infos_metadata',['../split__embeddings__utils_8cuh.html#a0994f8d37247e9754d069f16ee195c01',1,'get_infos_metadata(at::Tensor unused, int64_t B, int64_t T): get_infos_metadata.cu'],['../get__infos__metadata_8cu.html#a487bdb340f5c93165158a37aaf156fe9',1,'get_infos_metadata(Tensor unused, int64_t B, int64_t T): get_infos_metadata.cu']]], + ['get_5fnext_5fbag_5fboundary_5fand_5fl_15',['get_next_bag_boundary_and_L',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../embedding__forward__split__kernel__v2__template_8cu.html#a7cc9e1627beb86ecc866da06957e0fff',1,'get_next_bag_boundary_and_L(const uint32_t bag_boundary, int32_t *const next_boundary, uint32_t *const L): embedding_forward_split_kernel_v2_template.cu']]], + ['get_5fnvlink_5fmatrix_16',['get_nvlink_matrix',['../namespacefbgemm__gpu.html#ae554e4e9d8789449846323c52f840fe8',1,'fbgemm_gpu']]], + 
['get_5funique_5findices_5fcuda_17',['get_unique_indices_cuda',['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(at::Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga4887151424a90cfd0abef174a4e91f3f',1,'get_unique_indices_cuda(Tensor linear_indices, int64_t max_indices, bool compute_count): linearize_cache_indices.cu']]], + ['get_5fvalid_5fcpu_5ftensor_18',['get_valid_cpu_tensor',['../sparse__ops__utils__test_8cpp.html#a740d263ecb80b6e7cf28a86f561450b7',1,'sparse_ops_utils_test.cpp']]], + ['getpointer_19',['getPointer',['../structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html#ac04ebca5545952c6185a2693bc5d9fc9',1,'fbgemm_gpu::SharedMemory< int64_t >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html#a3472f2fcb0b65202627a7a5d0b47ab8f',1,'fbgemm_gpu::SharedMemory< int32_t >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html#a11507d418a31c798c09f74aa6569fb72',1,'fbgemm_gpu::SharedMemory< float >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html#a53ef47c469305fb8b5427b2a0063db6f',1,'fbgemm_gpu::SharedMemory< double >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html#aa277fc58794548c1d2619afa9cd0be9e',1,'fbgemm_gpu::SharedMemory< Vec4T< at::acc_type< float, true > > >::getPointer()'],['../structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html#a79e2902e4ab8379789578754af90253f',1,'fbgemm_gpu::SharedMemory< Vec4T< at::acc_type< double, true > > >::getPointer()']]], + ['getscalartype_20',['getScalarType',['../namespacefbgemm__gpu.html#ac7d6b4d86c0ce57c3af88ea03123fdb4',1,'fbgemm_gpu']]], + ['getsparsetype_21',['getSparseType',['../namespacefbgemm__gpu.html#a7dbc3a3bde83bfe7a18b720197f0f830',1,'fbgemm_gpu']]], + 
['gpuatomicincrement_22',['gpuAtomicIncrement',['../embedding__backward__template__helpers_8cuh.html#aa054bfcfa5ed7f584d2811fe48a2f757',1,'embedding_backward_template_helpers.cuh']]], + ['group_5findex_5fselect_5fdim0_5fgpu_23',['group_index_select_dim0_gpu',['../namespacefbgemm__gpu.html#a33cd874aab109dc15436869064c3d689',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fbackward_5fmeta_24',['group_index_select_dim0_gpu_backward_meta',['../namespacefbgemm__gpu.html#a213539d8845a20efd90e93fed16f1090',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fimpl_25',['group_index_select_dim0_gpu_impl',['../namespacefbgemm__gpu.html#abda14dada6ae2b39b175ed52824dbfa5',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5fgpu_5fimpl_5fmeta_26',['group_index_select_dim0_gpu_impl_meta',['../namespacefbgemm__gpu.html#a8d89670eae5b860788cb14175f01ce7e',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5fdim0_5funpack_27',['group_index_select_dim0_unpack',['../namespacefbgemm__gpu.html#ac4851777dc16c28c94a2cc9b58d3923c',1,'fbgemm_gpu']]], + ['group_5findex_5fselect_5for_5fadd_5fcuda_28',['group_index_select_or_add_cuda',['../namespacefbgemm__gpu.html#a394db33cacde2480607d48fe227274ef',1,'fbgemm_gpu']]], + ['gt_29',['gt',['../structfbgemm__gpu_1_1_comparator.html#a869e6734f5357dab7a63300629b414c8',1,'fbgemm_gpu::Comparator']]] ]; diff --git a/search/functions_8.js b/search/functions_8.js index 9ca306e65..4c46567db 100644 --- a/search/functions_8.js +++ b/search/functions_8.js @@ -1,5 +1,15 @@ var searchData= [ - ['new_5fmanaged_5ftensor_0',['new_managed_tensor',['../group__cumem-utils.html#gab708b23762a11187eb6a32a36f0e34a3',1,'fbgemm_gpu']]], - ['new_5fvanilla_5fmanaged_5ftensor_1',['new_vanilla_managed_tensor',['../group__cumem-utils.html#gad5e0d2307667c3db5e73f0c0eec15df5',1,'fbgemm_gpu']]] + ['half_5fto_5ffused8bitrowwise_5fcpu_0',['half_to_fused8bitrowwise_cpu',['../group__quantize-data-cpu.html#gaa9daf4f3dc64238a5de8f82bbae656cf',1,'fbgemm_gpu']]], + 
['half_5fto_5ffusednbitrowwise_5fcpu_1',['half_to_fusednbitrowwise_cpu',['../namespacefbgemm__gpu.html#a545dc5567b0a08c31f65e2fc7ae21749',1,'fbgemm_gpu']]], + ['hfma2_2',['hfma2',['../namespacefbgemm__gpu.html#a3ff3d0d7b40d8f2909fa6b35d64d250d',1,'fbgemm_gpu']]], + ['hfp8_5fto_5ffloat_3',['hfp8_to_float',['../namespacefbgemm__gpu.html#a1f35a2d3a2ede2e58e7986f8c2c757ec',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fby_5ffeature_5fcpu_4',['histogram_binning_calibration_by_feature_cpu',['../namespacefbgemm__gpu.html#a499764d7156d294219e3ae2629ae229f',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fby_5ffeature_5fcuda_5',['histogram_binning_calibration_by_feature_cuda',['../namespacefbgemm__gpu.html#ac639ce2e71982d5d1da0a30c92858aa8',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fcpu_6',['histogram_binning_calibration_cpu',['../group__sparse-data-cpu.html#ga201bb2241fc9d582d6c0fe968b0e71ca',1,'fbgemm_gpu']]], + ['histogram_5fbinning_5fcalibration_5fcuda_7',['histogram_binning_calibration_cuda',['../namespacefbgemm__gpu.html#a1b19059704ba1911efbedf4adcbb0ee3',1,'fbgemm_gpu']]], + ['hmul_8',['hmul',['../namespacefbgemm__gpu.html#ab50e28187eb7fdf5b8cd74cd8150b025',1,'fbgemm_gpu']]], + ['hmul_5fshort2_9',['hmul_short2',['../namespacefbgemm__gpu.html#a257181e3db25da8e4d1b4ef73976271d',1,'fbgemm_gpu']]], + ['host_5flxu_5fcache_5fslot_10',['host_lxu_cache_slot',['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga920da453c443675fc7fbc9d68e272a61',1,'host_lxu_cache_slot(int64_t h_in, int64_t C): lxu_cache.cu']]], + ['hostasynchronousthreadpoolexecutor_11',['hostAsynchronousThreadPoolExecutor',['../namespacessd.html#ac14b5cc833767dd1941b5c2de7153299',1,'ssd']]] ]; diff --git a/search/functions_9.js b/search/functions_9.js index d48573eb8..ff754bff3 100644 --- a/search/functions_9.js +++ b/search/functions_9.js 
@@ -1,10 +1,27 @@ var searchData= [ - ['uvm_5fcuda_5fmem_5fadvise_0',['uvm_cuda_mem_advise',['../group__cumem-utils.html#ga8a7d93d58bcc9700c3054639973e25b6',1,'fbgemm_gpu']]], - ['uvm_5fcuda_5fmem_5fprefetch_5fasync_1',['uvm_cuda_mem_prefetch_async',['../group__cumem-utils.html#ga07e32d271464bafc50cc100cb52ddb85',1,'fbgemm_gpu']]], - ['uvm_5fmem_5fadvice_5fdont_5ffork_2',['uvm_mem_advice_dont_fork',['../group__cumem-utils.html#ga723bf5f1a0ca1c7a77e76054d3332a6e',1,'fbgemm_gpu']]], - ['uvm_5fstorage_3',['uvm_storage',['../group__cumem-utils.html#ga6e119375c731f9e33f4cd81a1f2205e2',1,'fbgemm_gpu']]], - ['uvm_5fto_5fcpu_4',['uvm_to_cpu',['../group__cumem-utils.html#ga6d4781dfa6a77b895140836f6e6d523b',1,'fbgemm_gpu']]], - ['uvm_5fto_5fcpu_5fclone_5',['uvm_to_cpu_clone',['../group__cumem-utils.html#ga98ea4dd0481cc3839cf21e55e003e7af',1,'fbgemm_gpu']]], - ['uvm_5fto_5fdevice_6',['uvm_to_device',['../group__cumem-utils.html#gaad51bd52cc92230c0e91c5d4f61511c2',1,'fbgemm_gpu']]] + ['if_0',['if',['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a618af795eb1829b78b342e084130e1f4',1,'if(t >=T): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a37c3fe73e60868097d45b151e9c4a430',1,'if(is_zero_total_L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a49e7c05f68f0175f3c44c6b1c12c5117',1,'if(is_small_L &&table_warp_id >=num_warps_for_small_L *8): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a1958ec7365ff8575f7973e15353c0121',1,'if(threadIdx.x==0): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a509435224d0201170dbceeef2d47698f',1,'if(table_warp_id >=num_warps_per_row *(is_small_L ? 
num_warps_for_small_L :B)): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#a4ab8250d245b6612c02d934b63fdcd52',1,'if(is_small_L): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#aaa19ed116a2acf1b1ef0527b77b3d4ec',1,'if(L<=1): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__unweighted__v2__kernel_8cu.html#af6822b01edff1e16c53f21b0c6142ffd',1,'if(load_D - load_d< kWarpSize): gen_embedding_forward_split_unweighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a618af795eb1829b78b342e084130e1f4',1,'if(t >=T): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a37c3fe73e60868097d45b151e9c4a430',1,'if(is_zero_total_L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a49e7c05f68f0175f3c44c6b1c12c5117',1,'if(is_small_L &&table_warp_id >=num_warps_for_small_L *8): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a1958ec7365ff8575f7973e15353c0121',1,'if(threadIdx.x==0): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a509435224d0201170dbceeef2d47698f',1,'if(table_warp_id >=num_warps_per_row *(is_small_L ? 
num_warps_for_small_L :B)): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#a4ab8250d245b6612c02d934b63fdcd52',1,'if(is_small_L): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../gen__embedding__forward__split__weighted__v2__kernel_8cu.html#aaa19ed116a2acf1b1ef0527b77b3d4ec',1,'if(L<=1): gen_embedding_forward_split_weighted_v2_kernel.cu'],['../bench__utils_8cuh.html#aa3487d3e764e516ac71de417077959a6',1,'if(do_write *val): bench_utils.cuh'],['../namespacefbgemm__gpu.html#a29ef435892df0dc6cd3fa9769486e659',1,'fbgemm_gpu::if(index >=num_logits)'],['../namespacefbgemm__gpu.html#a32dace4feb1fa305053fd440163ba422',1,'fbgemm_gpu::if(curr_bin_num_examples > bin_ctr_in_use_after)'],['../namespacefbgemm__gpu.html#a4b4f7604af9accc2a43a8e060b6145e7',1,'fbgemm_gpu::if(index >=num_lengths - 1)'],['../namespacefbgemm__gpu.html#ac2276128422f0c744cc68659b731d53a',1,'fbgemm_gpu::if(next_offset==curr_offset+1)'],['../namespacefbgemm__gpu.html#a1d72e092775be40f6a57865b410d55e9',1,'fbgemm_gpu::if(list_id >=num_lists)'],['../namespacefbgemm__gpu.html#aa41e0708c4b465d4a89e0c1de6a60dd1',1,'fbgemm_gpu::if(per_sample_weights_addrs)'],['../namespacefbgemm__gpu.html#a6080a87e4588877fbbdd8a03d16d927d',1,'fbgemm_gpu::if(b >=B)'],['../namespacefbgemm__gpu.html#a9e204163946d36c19beef5443a1b71b6',1,'fbgemm_gpu::if(n >=N)'],['../namespacefbgemm__gpu.html#aa6453091b8359fcc2da599396bb27f52',1,'fbgemm_gpu::if(run_id >=sorted_linear_indices_run.size(0))'],['../namespacefbgemm__gpu.html#ad0904756703f278e8c03d0be1918211b',1,'fbgemm_gpu::if(run_id >=sorted_linear_indices_num_runs[0])'],['../namespacefbgemm__gpu.html#aaf49df4f26b7eff1308265a096c0c768',1,'fbgemm_gpu::if(SL==0)'],['../namespacefbgemm__gpu.html#a426625b7d5c06c4059e34784c1fdd74f',1,'fbgemm_gpu::if(t >=T||b >=batch_size_per_feature[t])'],['../namespacefbgemm__gpu.html#ae198c10fa781aa859c0e8666fc10063b',1,'fbgemm_gpu::if(i 
>=input_size)'],['../namespacefbgemm__gpu.html#a1958ec7365ff8575f7973e15353c0121',1,'fbgemm_gpu::if(threadIdx.x==0)']]], + ['inclusive_5fsum_5fscan_5fkernel_1',['inclusive_sum_scan_kernel',['../namespacefbgemm__gpu.html#ae86238f4ca864fb4ea41318ece747ab4',1,'fbgemm_gpu']]], + ['index_5fadd_2',['index_add',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_add()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_add()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a2f087d87df54652b9059bfa56b7c0dc3',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_add()']]], + ['index_5fadd_5fwith_5funique_5findices_5fcuda_3',['index_add_with_unique_indices_cuda',['../namespacefbgemm__gpu.html#a80e08c6c5c1ebf2b34c6490eee0e8415',1,'fbgemm_gpu']]], + ['index_5ffma_4',['index_fma',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_fma()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_fma()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#ad71e777976812302bf4173ce00641b55',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_fma()']]], + ['index_5fselect_5fcuda_5',['index_select_cuda',['../namespacefbgemm__gpu.html#a543ba161110516ef84a9fbeb83c7af5c',1,'fbgemm_gpu']]], + ['index_5fselect_5fdim0_5fgpu_6',['index_select_dim0_gpu',['../namespacefbgemm__gpu.html#a170ff30798a3bcf42cc3f0669f938450',1,'fbgemm_gpu']]], + ['index_5fselect_5fscalar_5fcumsum_5fkernel_7',['index_select_scalar_cumsum_kernel',['../namespacefbgemm__gpu.html#aa762379def70fcfe1f15ff2a347af4a9',1,'fbgemm_gpu']]], + 
['index_5fstore_8',['index_store',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_store(uint32_t idx, uint8_t *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_store(uint32_t idx, uint8_t *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a53ce8d22f3e5051594ff8799ede7167a',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, float4 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a3a736a75cd874d0a755c64bc2d5dbf36',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, float2 *ptr)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#ab6ad661dbc7d9699747b0ec4f268c92c',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_store(uint32_t idx, uint8_t *ptr)']]], + ['index_5fweighted_5fstore_9',['index_weighted_store',['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, float 
>::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, float >::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a8063756b0e7d2c067a4f7ec2c8f117c1',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_weighted_store(uint32_t idx, float4 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a76e58bf5fe9b795864d627ba6748d7d7',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_weighted_store(uint32_t idx, float2 *ptr, const float weight)'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#aae4a2ca3b742838cf705dcfd6b62b9ad',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)']]], + 
['initializer_10',['Initializer',['../classssd_1_1_initializer.html#af5e246dd12f1a6c4e06ab77a41bd0590',1,'ssd::Initializer']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5funweighted_5fcpu_11',['int_nbit_split_embedding_codegen_forward_unweighted_cpu',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html#a718e1ac4e0fa56a96e666ee2d5a5c40a',1,'int_nbit_split_embedding_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#a718e1ac4e0fa56a96e666ee2d5a5c40a',1,'int_nbit_split_embedding_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5funweighted_5fcuda_12',['int_nbit_split_embedding_codegen_forward_unweighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__codegen__cuda_8cu.html#a9c3b5fb374c1ef95520bc4e30b66325e',1,'int_nbit_split_embedding_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, const int64_t total_D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t pooling_mode, const 
int64_t row_alignment, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a5a581a6131f9754699b4e5bb27b20ecb',1,'int_nbit_split_embedding_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_codegen_cuda.cu']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5fweighted_5fcpu_13',['int_nbit_split_embedding_codegen_forward_weighted_cpu',['../gen__embedding__forward__quantized__weighted__codegen__cpu_8cpp.html#a5a1cc170a745f03faefac536cfcbf1e6',1,'int_nbit_split_embedding_codegen_forward_weighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_weighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#a5a1cc170a745f03faefac536cfcbf1e6',1,'int_nbit_split_embedding_codegen_forward_weighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, Tensor indices, Tensor offsets, int64_t 
pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_weighted_codegen_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5fforward_5fweighted_5fcuda_14',['int_nbit_split_embedding_codegen_forward_weighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__weighted__codegen__cuda_8cu.html#ae65cbb34f3d373fe3e12b7bb899c1b10',1,'int_nbit_split_embedding_codegen_forward_weighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, const int64_t total_D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t pooling_mode, const int64_t row_alignment, Tensor indice_weights, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a79655cba701e82021eefe7fe8cb72916',1,'int_nbit_split_embedding_codegen_forward_weighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, int64_t total_D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, Tensor indice_weights, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_weighted_codegen_cuda.cu']]], + 
['int_5fnbit_5fsplit_5fembedding_5fcodegen_5flookup_5ffunction_15',['int_nbit_split_embedding_codegen_lookup_function',['../group__embedding-cuda.html#ga0749f1c6540189dd47b32a56858f82fb',1,'embedding_forward_quantized_host.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fcodegen_5flookup_5ffunction_5fcpu_16',['int_nbit_split_embedding_codegen_lookup_function_cpu',['../group__embedding-cpu.html#gac115303550aa9af7c170baef63bcdb00',1,'embedding_forward_quantized_host_cpu.cpp']]], + ['int_5fnbit_5fsplit_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcpu_17',['int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu',['../gen__embedding__forward__quantized__unweighted__codegen__cpu_8cpp.html#ab6ae7551f9cd9d5cdb845240887aeaa1',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, const int64_t D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp'],['../embedding__forward__quantized__host__cpu_8cpp.html#af3d9ee6fd394ec0055de7f2c2acfba3d',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cpu(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, int64_t D, Tensor indices, Tensor offsets, int64_t pooling_mode, int64_t row_alignment, int64_t output_dtype, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_unweighted_codegen_cpu.cpp']]], + 
['int_5fnbit_5fsplit_5fembedding_5fnobag_5fcodegen_5fforward_5funweighted_5fcuda_18',['int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda',['../gen__embedding__forward__quantized__split__nbit__host__unweighted__nobag__codegen__cuda_8cu.html#a9b168b9b2d002f86f7f16211b83fced0',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, const int64_t D, const int64_t max_int2_D, const int64_t max_int4_D, const int64_t max_int8_D, const int64_t max_float16_D, const int64_t max_float32_D, Tensor indices, Tensor offsets, const int64_t row_alignment, const int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, const int64_t max_float8_D, const int64_t fp8_exponent_bits, const int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu'],['../embedding__forward__quantized__host_8cpp.html#a0545cdf708e09c0958f1538e7b4b29c9',1,'int_nbit_split_embedding_nobag_codegen_forward_unweighted_cuda(Tensor dev_weights, Tensor uvm_weights, Tensor weights_placements, Tensor weights_offsets, Tensor weights_tys, int64_t D, int64_t max_int2_D, int64_t max_int4_D, int64_t max_int8_D, int64_t max_float16_D, int64_t max_float32_D, Tensor indices, Tensor offsets, int64_t row_alignment, int64_t output_dtype, Tensor lxu_cache_weights, Tensor lxu_cache_locations, int64_t max_float8_D, int64_t fp8_exponent_bits, int64_t fp8_exponent_bias): gen_embedding_forward_quantized_split_nbit_host_unweighted_nobag_codegen_cuda.cu']]], + ['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_19',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function',['../group__embedding-cuda.html#gabbe880100f1036a979f3a8d8755447d0',1,'embedding_forward_quantized_host.cpp']]], + 
['int_5fnbit_5fsplit_5fembedding_5fuvm_5fcaching_5fcodegen_5flookup_5ffunction_5fcpu_20',['int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu',['../group__embedding-cpu.html#gaf5c83f0c03200546398764261403749d',1,'embedding_forward_quantized_host_cpu.cpp']]], + ['invert_5fpermute_5fcpu_21',['invert_permute_cpu',['../namespacefbgemm__gpu.html#aa79c3b125ba955f02e8ee2e70b1bbd32',1,'fbgemm_gpu']]], + ['is_5faligned_22',['is_aligned',['../namespacefbgemm__gpu.html#ae24b9318a63a9532f426abc0b0e94819',1,'fbgemm_gpu']]], + ['is_5fuvm_5ftensor_23',['is_uvm_tensor',['../group__cumem-utils.html#gacba28ed334d071e79c1ead1792391e9d',1,'fbgemm_gpu']]] ]; diff --git a/search/functions_a.js b/search/functions_a.js new file mode 100644 index 000000000..1f3551116 --- /dev/null +++ b/search/functions_a.js @@ -0,0 +1,70 @@ +var searchData= +[ + ['jagged_5f1d_5fto_5fdense_0',['jagged_1d_to_dense',['../group__jagged-tensor-ops-cpu.html#ga93b5edf03f38d8eaf9a0f1ece0bc1af7',1,'fbgemm_gpu']]], + ['jagged_5f1d_5fto_5fdense_5fmeta_1',['jagged_1d_to_dense_meta',['../namespacefbgemm__gpu.html#afdde1bd5a99cc5bcdfaf27b4c42cad7b',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_2',['jagged_2d_to_dense',['../group__jagged-tensor-ops-cpu.html#gaaa301b81a22a3d823ba5e65828093113',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fforward_5fcpu_3',['jagged_2d_to_dense_forward_cpu',['../namespacefbgemm__gpu.html#a70d2cdc82d96c9c4298b57133393a800',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fgpu_5fbackward_4',['jagged_2d_to_dense_gpu_backward',['../namespacefbgemm__gpu.html#a7c104248a9abcdcdac6bdcac571930a4',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fgpu_5fforward_5',['jagged_2d_to_dense_gpu_forward',['../namespacefbgemm__gpu.html#a56c28427858ea272148bdbfb9f373191',1,'fbgemm_gpu']]], + ['jagged_5f2d_5fto_5fdense_5fmeta_6',['jagged_2d_to_dense_meta',['../namespacefbgemm__gpu.html#a67b19e389f869540bd35510d4e8e7908',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5fbmm_7',['jagged_dense_bmm',['../namespacefbgemm__gpu.html#aed181c3885f392fec8c38cdf10266d68',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_8',['jagged_dense_bmm_forward',['../namespacefbgemm__gpu.html#a3eec1622180be9b7a31891d5e9f2ba71',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_5fcuda_9',['jagged_dense_bmm_forward_cuda',['../namespacefbgemm__gpu.html#a4961acd2615018dff4fdf1390158f0a4',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fforward_5fmeta_10',['jagged_dense_bmm_forward_meta',['../namespacefbgemm__gpu.html#a022cdaaee01f619cf0cb7b29d80cbc65',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fbmm_5fkernel_11',['jagged_dense_bmm_kernel',['../namespacefbgemm__gpu.html#a6c32f4b4ccfdef9cf63d463cb235ec38',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_12',['jagged_dense_dense_elementwise_add_jagged_output',['../namespacefbgemm__gpu.html#a47e4d714a08316066470d979f97f1d81',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_13',['jagged_dense_dense_elementwise_add_jagged_output_forward',['../namespacefbgemm__gpu.html#a10611541bdce9c65bfe48a01474d1725',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fforward_5fmeta_14',['jagged_dense_dense_elementwise_add_jagged_output_forward_meta',['../namespacefbgemm__gpu.html#a56cac54ea3d7672c629010018ba59568',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fmeta_15',['jagged_dense_dense_elementwise_add_jagged_output_meta',['../namespacefbgemm__gpu.html#ab421ce372347f826b7e7ff9e35f26c93',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5f_16',['jagged_dense_dense_elementwise_jagged_output_',['../namespacefbgemm__gpu.html#a319b3f5f33bec0aff79f0ee990483f3d',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5fmatches_5fopt_17',['jagged_dense_dense_elementwise_jagged_output_matches_opt',['../namespacefbgemm__gpu.html#adfb04060c9eecdadcf59b3c15d5bca08',1,'fbgemm_gpu']]], + ['jagged_5fdense_5fdense_5felementwise_5fjagged_5foutput_5fopt_5f_18',['jagged_dense_dense_elementwise_jagged_output_opt_',['../namespacefbgemm__gpu.html#aac40d60c62b0d176a962cdad964e34f6',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_19',['jagged_dense_elementwise_add',['../group__jagged-tensor-ops-cpu.html#gaa797caaa08c70857433ae987d9cf30d7',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_20',['jagged_dense_elementwise_add_jagged_output',['../group__jagged-tensor-ops-cpu.html#ga1290f40c3ba39837dd009c3006353d7c',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fcuda_21',['jagged_dense_elementwise_add_jagged_output_cuda',['../group__jagged-tensor-ops-cuda.html#gad34ac20d2c9be5a6489c8e8befff7938',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fjagged_5foutput_5fmeta_22',['jagged_dense_elementwise_add_jagged_output_meta',['../namespacefbgemm__gpu.html#a16d84a11c2e32cb0064721354fb190b7',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fadd_5fmeta_23',['jagged_dense_elementwise_add_meta',['../namespacefbgemm__gpu.html#aff88b44d096bd7a039dca72a5855198c',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fjagged_5foutput_5f_24',['jagged_dense_elementwise_jagged_output_',['../namespacefbgemm__gpu.html#a124d128a82ffb0342ce597d0325060fb',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fjagged_5foutput_5fopt_5f_25',['jagged_dense_elementwise_jagged_output_opt_',['../namespacefbgemm__gpu.html#aded7d8ce8ffbcce568c498fb32a7d071',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_26',['jagged_dense_elementwise_mul',['../group__jagged-tensor-ops-cpu.html#ga5521ad46f5bab0d77c8bb036742f455d',1,'fbgemm_gpu']]], + 
['jagged_5fdense_5felementwise_5fmul_5fbackward_27',['jagged_dense_elementwise_mul_backward',['../namespacefbgemm__gpu.html#a6de8f2f64f7d90ab1997df02470a9564',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fbackward_5fmeta_28',['jagged_dense_elementwise_mul_backward_meta',['../namespacefbgemm__gpu.html#abfbf6c239d283084ed1c68f18ea24af5',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fforward_29',['jagged_dense_elementwise_mul_forward',['../namespacefbgemm__gpu.html#aaa297ab58f55125d7eb7b040cc4c254b',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fforward_5fmeta_30',['jagged_dense_elementwise_mul_forward_meta',['../namespacefbgemm__gpu.html#ac30cb8e7e035c24bf4f6ac15bf1b623a',1,'fbgemm_gpu']]], + ['jagged_5fdense_5felementwise_5fmul_5fmeta_31',['jagged_dense_elementwise_mul_meta',['../namespacefbgemm__gpu.html#aaeeacda7f3587bfe9bf2ecf376dd635e',1,'fbgemm_gpu']]], + ['jagged_5fhash_5fsize_5fcumsum_5fcuda_32',['jagged_hash_size_cumsum_cuda',['../namespacefbgemm__gpu.html#aabd8b530d0ac7e5cb96cf19c7eb517e9',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fcpu_33',['jagged_index_add_2d_forward_cpu',['../namespacefbgemm__gpu.html#af80524a7d454f6db1c478808e8a659a6',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fcuda_34',['jagged_index_add_2d_forward_cuda',['../namespacefbgemm__gpu.html#a53a6da74de342260dcb15c68e9bddfd6',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fforward_5fv2_5fimpl_35',['jagged_index_add_2d_forward_v2_impl',['../namespacefbgemm__gpu.html#a8e1ed94256304ab16b948117d5315ee2',1,'fbgemm_gpu']]], + ['jagged_5findex_5fadd_5f2d_5fkernel_36',['jagged_index_add_2d_kernel',['../namespacefbgemm__gpu.html#ab571c6d5519c86bddfe58835c8209a4c',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_37',['jagged_index_select_2d',['../namespacefbgemm__gpu.html#aca95193cb0cc3db7030f18cb59c6cc33',1,'fbgemm_gpu']]], + 
['jagged_5findex_5fselect_5f2d_5fforward_5fcpu_38',['jagged_index_select_2d_forward_cpu',['../namespacefbgemm__gpu.html#a71a54a14d90862afc8e5fe03e0c9ed8f',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fforward_5fcuda_39',['jagged_index_select_2d_forward_cuda',['../namespacefbgemm__gpu.html#acb5a744fbd29c8a3a25621c2850686c1',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fforward_5fv2_5fimpl_40',['jagged_index_select_2d_forward_v2_impl',['../namespacefbgemm__gpu.html#acd9af0fd221ab3fc330ca9f278433a3f',1,'fbgemm_gpu']]], + ['jagged_5findex_5fselect_5f2d_5fkernel_41',['jagged_index_select_2d_kernel',['../namespacefbgemm__gpu.html#ab1228b502a424869c5a7353f9fe52316',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_42',['jagged_jagged_bmm',['../namespacefbgemm__gpu.html#ae94c97196a7c392695b64f0db906ff4c',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_43',['jagged_jagged_bmm_forward',['../namespacefbgemm__gpu.html#a5b01fcfb83764115f38eeab21c28a6a3',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_5fcuda_44',['jagged_jagged_bmm_forward_cuda',['../namespacefbgemm__gpu.html#a0793a1a7b328d1351b6036d0be6a9c3d',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fforward_5fmeta_45',['jagged_jagged_bmm_forward_meta',['../namespacefbgemm__gpu.html#a2722fce931f20d923aba071236be4c87',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5fbmm_5fkernel_46',['jagged_jagged_bmm_kernel',['../namespacefbgemm__gpu.html#a33c7044a13254607610928c6825738b1',1,'fbgemm_gpu']]], + ['jagged_5fjagged_5felementwise_5fdense_5foutput_5f_47',['jagged_jagged_elementwise_dense_output_',['../namespacefbgemm__gpu.html#a8fa5d329cfcc18c3304ba018919004ff',1,'fbgemm_gpu']]], + ['jagged_5fslice_48',['jagged_slice',['../namespacefbgemm__gpu.html#ab17aab73b431292434fd0d642a538960',1,'fbgemm_gpu']]], + ['jagged_5fslice_5fforward_5fcpu_49',['jagged_slice_forward_cpu',['../namespacefbgemm__gpu.html#a4e6521d00a6f81ad8ad7f7d38eef1aea',1,'fbgemm_gpu']]], + 
['jagged_5fslice_5fforward_5fcpu_5fkernel_50',['jagged_slice_forward_cpu_kernel',['../namespacefbgemm__gpu.html#a284b652fdac146671fc324ac57d2ad5d',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_51',['jagged_softmax',['../namespacefbgemm__gpu.html#a069ed261b53e7051b85f3e572cad7f7e',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_52',['jagged_softmax_backward',['../namespacefbgemm__gpu.html#a7ba518434a034920e1092bf6d73879fd',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_5fcuda_53',['jagged_softmax_backward_cuda',['../namespacefbgemm__gpu.html#a305d9969e73060e49580aab1456ceb35',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_5fkernel_54',['jagged_softmax_backward_kernel',['../namespacefbgemm__gpu.html#a7101ddaed8357d824a9eeeaff67e5c4c',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fbackward_5fmeta_55',['jagged_softmax_backward_meta',['../namespacefbgemm__gpu.html#aad25e4e44afa7169c17e48d726ee0477',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_56',['jagged_softmax_forward',['../namespacefbgemm__gpu.html#a023a8d9db48d27efcd2e77ede6366f5d',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_5fcuda_57',['jagged_softmax_forward_cuda',['../namespacefbgemm__gpu.html#ab117510dd56fd42f3d774d22633b107f',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fforward_5fmeta_58',['jagged_softmax_forward_meta',['../namespacefbgemm__gpu.html#ac14e78d89697f34bcaa7c0a725c8a04a',1,'fbgemm_gpu']]], + ['jagged_5fsoftmax_5fkernel_59',['jagged_softmax_kernel',['../namespacefbgemm__gpu.html#a20e3d96daba045e321717b025f4124cc',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_60',['jagged_to_padded_dense',['../group__jagged-tensor-ops-cpu.html#ga6d19e2c055144e4fe59b06999be34670',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fbackward_61',['jagged_to_padded_dense_backward',['../namespacefbgemm__gpu.html#a861454c4383e6a0869a6c007fc498eed',1,'fbgemm_gpu']]], + 
['jagged_5fto_5fpadded_5fdense_5fbackward_5fmeta_62',['jagged_to_padded_dense_backward_meta',['../namespacefbgemm__gpu.html#a8663dcc9727a468507eb75a849ae5820',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_63',['jagged_to_padded_dense_forward',['../group__jagged-tensor-ops-cuda.html#gaffad7e38f6faf5f8365784fbf82a26f5',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fforward_5fmeta_64',['jagged_to_padded_dense_forward_meta',['../namespacefbgemm__gpu.html#a4fc6df6df430f9f9a20d7fe9d88dd009',1,'fbgemm_gpu']]], + ['jagged_5fto_5fpadded_5fdense_5fmeta_65',['jagged_to_padded_dense_meta',['../namespacefbgemm__gpu.html#ae45c299345273bf31be20e4893f58c28',1,'fbgemm_gpu']]], + ['jagged_5funique_5findices_5fcuda_66',['jagged_unique_indices_cuda',['../namespacefbgemm__gpu.html#a006273b56cd5a2efd001ad71d801a551',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_b.js b/search/functions_b.js new file mode 100644 index 000000000..fca17ecac --- /dev/null +++ b/search/functions_b.js @@ -0,0 +1,7 @@ +var searchData= +[ + ['keyed_5fjagged_5findex_5fadd_5fdim1_5fkernel_0',['keyed_jagged_index_add_dim1_kernel',['../namespacefbgemm__gpu.html#a7d13c6946f45ae31d20aaecbd2316fec',1,'fbgemm_gpu']]], + ['keyed_5fjagged_5findex_5fselect_5fdim1_5fkernel_1',['keyed_jagged_index_select_dim1_kernel',['../namespacefbgemm__gpu.html#a0a518ef8f85868c32ac832576f8504d9',1,'fbgemm_gpu']]], + ['keyed_5fjagged_5findex_5fselect_5fdim_5f1_5fgpu_2',['keyed_jagged_index_select_dim_1_gpu',['../namespacefbgemm__gpu.html#a50a64d97045199097d3ff83edaf56a1a',1,'fbgemm_gpu']]], + ['kwarpsize_3',['kWarpSize',['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5cb5e51b17eeacd9818bc06b9eb55ddd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5bf3f753d62805ba481f4394edfa3158',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6e814e4e84507c4c3d932abf55dc8b86',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a1df9e821214c938534c26d9ad87c1cff',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t 
max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a6c1937cacb2c930220dfb75c2ad2fdb4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): 
gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a375f1380c0a43779a6521f855f7c90ef',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#ad74db204c21ce57463de29efd2b51c22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__cta_8cu.html#a5f6257aba106ad398e4b4a75471a8642',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_cta.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#ad8b31de2b716f254b2d55b709a332afa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a4c8628eff4245612b72787529fa2588f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#ae0f0975698d817274d5b21d1dd31285c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__backward__kernel__warp_8cu.html#a2166d1c956baff37ca5f2aa75dd5d29e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const at::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > grad_offsets, const bool permute_output_dim_0_1): 
gen_batch_index_select_dim0_backward_kernel_warp.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a3bf7d511b93dad425030c52ff0b35378',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a33f0706d826f38b6f36f4657f5a4bbbd',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ae5ffff834bcf0d76a398a76c06a9d01b',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#ab824e6081e4272e9f56dd57114a11d1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a0157d8084d739723c62bc11e05187901',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, 
const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< at::Half, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__batch__index__select__dim0__forward__kernel_8cu.html#a5732b42f4e3be21733885ce73871b37a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > total_L_offsets, const int32_t fixed_L_per_warp, const bool permute_output_dim_0_1, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > output): gen_batch_index_select_dim0_forward_kernel.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a390d0e97c72c325e3497aeaa3226d527',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a64b75e41b7d50f479b37a8c9cf0c1bcd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a446498d5289ca85dd627faffea758f45',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#ad471b8ae6cce12a41ac160db1243f289',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#ae31ad4c12fc469e5ea516f04f158b98b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a445aa60d61ffd3755914ffcf55c1a6fa',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 
2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a91f7f08a7ae090f72ea7236ba0fb5c96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__cta_8cu.html#a447c3f2918447f50e234bb7c3e2b1532',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a03451f7ef0e82d0861c795948f00bf9b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#ad19427e173ef6c061d7a98427d69a595',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a067846db797129cc6a85a87a6009c288',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#acab5c90a244916d389e9273df81384ab',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a8c2c7cc342e76ed32a9621fd6bc6753e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a6da3d4d33386cf358b201f5a9a2602bb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 
2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#a59dab5f4ff3072665da93792aa3f85e9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__kernel__warp_8cu.html#ae39679f36fe6a0b7b8846c79f69f4bf9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a74aa12547ff3a9b9787bcdffe7b95e71',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5e2c26bd8a7744de11021a9356b59a74',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a98c8243c5834d18ba31ffd8f3a570480',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a9ace33692ea18b9bd6c92308133c4499',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ad9f02bfae155a2b4114e80ed9ef6390c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a225fbb17a5d73ae68945ddba0baf3960',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac93d19a97b3d9f1b1ae742787b03d5ba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a78f61ea01f92fc50b78d776edda5691c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2f1fea77b7579d1cab96be89c027396a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a5ebb0d91afb08ea0721308c278b18b89',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a2f21c580a600ad4f25aa58bbcae83e5b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a042eb088419228e49b76f7923732ed0c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#acac50d0765417aed0ba2275ef09e7363',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ab528e6c3e784b1648ebe89230f6f864c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad07738475ef95243e6a5d08e8e6096f1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ae4bb5bd4f1fe9caf6f7a1d3107a479ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a28bf244596f3c3376a70af53e767ed7d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a0386dbc79ea0aaacffbe7cf8cba78167',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a88a5089ba98be8ad981c0d2fd5c74657',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#ac768cc0753ab5318bbe47835d4f9fa9d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a557bbce544c0a0b3dd4036ec01b6df55',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a802903738d859e74795111ec77fb0268',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a8d14751fd1f29be0069e1a35e0f921e4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__cta_8cu.html#a1773883a254e0ab07fb0313e41e997d8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): 
gen_embedding_backward_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a735bf953224cfed630501bf38342b07c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a608b71f09301faa6ce5838495c9e8de1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a5860a2f37abc179f0358170ee6403905',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a21576335b9047871158e90e2032e8912',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a04bf660a884cfa9ce91901a66fd99f75',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#abf0dc6720193f4ab9a278a95c495572d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a2f8395d5782bc2895b99dde1a0a5ca20',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adagrad__split__weighted__kernel__warp_8cu.html#a2bfd2c4264e14c4f64b737892c1f4f06',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate): gen_embedding_backward_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a07612c8115947993a4f0659814bdb991',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, 
true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const 
uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a9ceb5776015ed4c35b0dabca7fa8f4c5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float 
weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#ae682ad60acd60875e5499ef3ba62ba8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#af09531afc63fe34068a117835f5276e2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, 
true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a3fd0b06c245d1deda1dfd409ef777dbc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a0b140fe99d998657ba70d37cb96981ae',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, 
const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, 
float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits 
> momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__adam__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a98043b075d1f73a69bd0b19b1a24283e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a188fb685cd69453ab94f992332f523a9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a025f08f037ddf498278c429e09fd4d4a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a35f6a98383bf1ed951023b1fe432ed4c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a336a999e1b383c51b25841fa00f768d4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a745a7f66bb6899e5071ee55e90f23368',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#af9137cfc1d9e0421323b78bf589c34fc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a440eee4271eb5f61b204de4ec66054d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#adf1cf7a1807aab50d346ef163c534c1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acde8c89a937e31cb98aa026b261cfe23',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ab8e910b2c682642ac61185d1b155c5eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ad1d9234d02b6be2ab2bdc5f4a8dc5701',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acce157d175e9e72545e8784647a38511',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a377694b1c0ce71b8d0c56077a904f7d7',1,'kWarpSize(const pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8d2430849bd51fc5ad283d1a300cabba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8ecc1609ac62272a2c0f5a1e1cddbed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ae8c1bfed5b951970a40f4028998d21fd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a73eba662cabf7a9761d2cf5d195206f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ab903a35e3bd981f1436d46179b87ecb9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a4074249c4919e43d534eb0904fa4693b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t 
max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ac2f871ccb0e37e363f7b979d923f944d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ad39229402610f8c9069ea8a7e1c6a0ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a01c0225eea92b7b0403572335b1abc61',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float 
learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a3490c2bf081c92095011640fb03961b5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af95c4def12e4117e2d7bdc89b8fb0506',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a3f1c2f2aebc7a13ddade48d2a2f0301c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a023ead14754421961a4b473a3b1bb81c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a5ff2b2e15a95a8d176f99a8eebddf45d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a97cc1b7bcf350b322be5238011334085',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aebd1c348edc2accec933a20abbf4ff2a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af13fd6356fec61b096f429f666c4d50a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a8ae09f234561f1e415ef920bbf6eba22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6c1e5c2776f4209766c769243bf57894',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a96ca79bd9787eab9dfe57a09f61590db',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af8966c1a682b91a466caa300f057d2cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#aea22f0f456a89d61d1a066e7b363f59a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a78184f7b8b96c9fc9daa6d61c6bf8b32',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a933ef9f4d58e4ecad71988cd6f5ad537',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af15cb1c5b6cddd5d3678e3cee0a6cefe',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a11b62696a1fcc6753a62e4b7b78987a9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a58cc18641eaeee8eb587cb2a3726e85b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ad7d432c589db7e87949a9d0ca5533b54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a98fc1738f166a55809b2648796416db0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0cb98e4afaf555388869ebe3242fc7d0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a2bcc4982507c7169f085b06d8bda77eb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac58c7e73b10a41dc9f49d4e477b20fb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac7cbe79ba3521a4bbd4c14a74fd6adff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__approx__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0e895892d276833086475c0e7f1b7927',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_approx_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a1078e271f687367cefee7d0e75efe3d6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): 
gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#acdc78be52effcf8cc2c910b822c3ee7a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#af8601ce12308ef84b4899504296ae6ce',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a8b3df46fe1527fa468b07f9b7629420e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a7fda08a8c83a3557857418ea43e4dea3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#aa0685df0fb0a672d3d2237bd536db1b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#ac806eae9eee01106ea55ef146007dfe8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__cta_8cu.html#a06567b685179fac57c60d07bfc5596f9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a8c8e7afc35b5a54e69b3826c35adf2de',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a621cee00cffc059f6e5dac1dae6c870a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a9cf51444fe766e08d86ec3b884680083',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 
2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__kernel__warp_8cu.html#a1f4c065ae0c477c9055f201ff1d77eeb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a2431e3a9f193cb26104acb7111bb16e7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#ac73727b32e66331f5cfe9705c2bcf9da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 
1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#ab17cf37109f61a98a1e67e278282d410',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a56eec79755c7e031dac93d7fee216fcb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a9e80797bba1bde61de4e23580a123045',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a2309228f9f01e4fcfd7620b415458f5a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a1edce0d6c349a03501ea2777a101af79',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__cta_8cu.html#a095215da51de608e36ba8292e72c72af',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > 
temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#ab671eaaed996f9f41eca1f557abae645',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a508ae4c79692f2664971272e30d3fc2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a1ed3a6b528acb610a62f188de95ebc0d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__unweighted__nobag__kernel__warp_8cu.html#a4fb277896c516d3421f917fbbbbc10e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, float unused): gen_embedding_backward_dense_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ac34e39eb121e260238cc9a54f2d13a85',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a761283af416d74a3f610cb64f134cbad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, 
const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ab2bf567d2b9120f65832f9e8e227c3bb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#af798849724e5b343ef0987b64245c41a',1,'kWarpSize(const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#af04ea470cdd8a07f331e1efbc90114d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#ae6c1506493e5d8b4539080b206713dce',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a0beafd356bf1cfb6ea68ff7e1bd2992b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__cta_8cu.html#a356f3f696dd24ffa3fcf741fd8cd2ce7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float unused): gen_embedding_backward_dense_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a36b2d055ae9089bfecce1598d5ee5734',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a5765c206de6271ac6019a718fd7ad6b3',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ae0f60eb17c2973c16120ac880fa1405c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a33f0017811260350774433a6b81f85ea',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a929b3395fb702cbf1354da769ca55637',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#a19a7ecd9eeedc4239cf1b987d3f4d15f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#ab3795d6b83ee437c61880577c78b2273',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__dense__split__weighted__kernel__warp_8cu.html#afb2bcda34aa0401c61ef4fd5ebe0b090',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t info_B_num_bits, const uint32_t info_B_mask, float unused): gen_embedding_backward_dense_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a025db262738d28e0f6d0073da9eecc1c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t 
iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#ad312b70230d4098d8ac2747559c7f26d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a0b27ae9a200a1ece5394819d34ccab40',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t 
D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const 
int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ac9b1b580c02b691e732330917b4346b9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t 
info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a234aa0426b89c62486c8f88fdd7722e8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, 
float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#afa3433936e5b727b1211effc7414d937',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lamb__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#aea8e820a7a4bc3fe64bb6c818542a3aa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ad64afc5ea3a238f14048b1d678f617be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ac8f9bf44e289b97fd4b68b3ac86e5fa2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a3cf034bfeabf17e2c02ef5eff0e39d27',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ae9bd7c9d24668aa08267f29bcc8d579b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#ac2f4c84a8e13733979d8c8eb160d8ab6',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a9668523612cb73bdea52956fff1a645d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__cta_8cu.html#a36dbb2ed81d41998cd4ddd239f6e18ff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a47b1f486724dfb5ef0c59660725ba49e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af72ccb394ff0a9c8bad2415b26124ee8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af37b23a1376bb72a6936967e93403d29',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a839068bef278b0cf5184340361f2db61',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#af3b0b53e46e934265545fef179bc4a42',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a3be6a3a67f391545b95c03cdfeabab49',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#a9e740e6e3df02da3c05d0dfd940a2793',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__kernel__warp_8cu.html#acff100b57110e4d629c786c3535bf208',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a2992f3c3797e58777a7f7d6aff063137',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< 
float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a88eb41bb17cb58eaf37c6e5cc0ae1bfe',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abab241cb0bbbdda5a3d240feff95de96',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a837a847bd0e24c4c323f60f3cd49bb93',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#ad25cc23c713ce4d2ce9a057d23d66b8c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aa8a6cd9058bac3b6775b6057a8b0beb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a28402ef2cf3a1b34fdadac6a6ef06adf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a33595ad9426cb537c6e4e9c2bb0d1cfc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a28fd1bfeb870e4192c831675880469bc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ab570073ba2f2dc988643433eb9ee56e3',1,'kWarpSize(const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a2543bb59812617ba91ec36256ea579b5',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ae0bf41c3a1fa62e4aacaf4dd6e3ba1ac',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float 
weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a60837a52bb429e86372390ab093b5c3f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float 
momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a6a6e51dcbe4f354b395c5ef3a8632e9b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, 
float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#afcd6a68c14971422bbcac044bc2e5fe7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a0e9aa9538f85f1a20881b99a619ff138',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a021fc973b5fc3d624856c3095ea0d8c5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a1a126c0d3c9315985228744121d10f65',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#aba4fc1bf5159b001bdbeaed09bac28cf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a87b68faad6789ef38e5ee96bdf0adadb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#abb70eda92401330f9c430e33657f5390',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#accc2086d06273c59409c74b598e17066',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, 
float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a331c0180e8dc65d864006a18ae10f3e7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__cta_8cu.html#a1da0aa99bcd3a3a2ad540eaba284bd08',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a46d814dea7a25a249b9e0fc0c82023ed',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a19120fdc3ff0a026755d36ddb40ff43b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): 
gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a1d111c3d803b0ed234aec8f5604bec87',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a7f28593d442951ae04e27670c892fdc9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t 
info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a80acf7650ec2712ea7d51f7d5156fc39',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a7643d87610f1fd256807566fcae51c36',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#ab9a28b117d8d2c802b31c3850cebf7ef',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__lars__sgd__split__weighted__kernel__warp_8cu.html#a53612aafa2641dc1c70fc11355c354c5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float learning_rate, float eta, float momentum, float weight_decay): gen_embedding_backward_lars_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a459aacd66b48c479d5773c84d129086d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aab8efedfe2eecb8e722290e8670b57be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aef14d493a157796b5d5b3708471dd5f9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a1495ee920385d2c17517f402e4f2f1d3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a13fd2498aed38e9bc488bba7aed3c70c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#ae1896638d5d062dd4fdb76ea25fa25ad',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#aff46c2a59e01f53a86a7b0d79a618a13',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__cta_8cu.html#a0bf2b9756ea833bf245d6fd93a68bba2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a07fe51377b6ac8933fda5657824dfa00',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a0424cc55d1baf826ec4665dc699c0ee8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#abe7a518fe77140a9f84658b9be73ca57',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__kernel__warp_8cu.html#a93379bd0b52108c09ce0c6012c1a0bc4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a9efa56f919a034ad1c2eb4339babfacd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#af1b7ece649e9d0dbeb4a372364cfbf54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a5be30952d02614260f81e9b29d17f767',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a6c887e7cd209eff2be60616a0eb2ea9f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#aa6dac18027510aba99d797d8c340fa0c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a1aa0bffadd8de61d9327613f1b0c3d8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a805ef69c6e5b5bbd4a5d70b053dc8940',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t 
total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__cta_8cu.html#a67e2d754aeb8030c70dfdf94358cac76',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ab111a2635d39331e5dde581b2cf5ab40',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#ac42deea1306a7165392cc02c0c962381',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a216facff7aab2092d3300f52f73f441c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__unweighted__nobag__kernel__warp_8cu.html#a9a65221171b1118ec811d883a600b7eb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a2dcf33b730969fab9d8d9e13f5812500',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1628e1fb812ec5d70a2a3701145ae3dd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a3af1e6fa25253eb084459b3d13ebf58a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a5731f2347a6fc9cfce399131b7535c72',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1916dad21c1174ed094bf7cb9990674f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#abcac665cc8837bd07d64ee1f1d22c9bd',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#a1627d1331758cf0987f80b531597de96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__cta_8cu.html#aa2f1fe9cdd926d486017e9c9e3ee401e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a58dd95b539386ce0756417ffa7e3c675',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a4ac5aa9e7a97b988f21d79f2c77e8a42',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): 
gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a6ad697b6cced262fbf9c5329af882295',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a946e1b9e34decc6ef732c17c06eaf67b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a99d2945d0e14c762a262971ad5cdddca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#ad3382f93d63430516e0fa4ee3dfcd35b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< uint8_t, 
1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a88e9b91386946c328e4ea9cd1074af16',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__none__split__weighted__kernel__warp_8cu.html#a376fefbb04f4e4d081447881d6aa3ca7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > grad_dev_weights, const int32_t max_D, const int32_t info_B_num_bits, const uint32_t info_B_mask, int64_t total_hash_size, int64_t total_unique_indices): gen_embedding_backward_none_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a11b57ed4691d1c773211ef5481a6dd02',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float 
learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a0e70fccd2a1e2a9e2135f0b38f7fb8b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits 
> grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a2b831b47546fedc2c25d2ade8b88b756',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a189144e6ce32a982c752160cfb103ec8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#af70ed3aa3b3e9f4ef10054777ea73ab1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t 
info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#abb5bc6565be4b9b6cc47cb4ca0d02a12',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float 
weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__adam__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_adam_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a458b855930bbc15ecb8cd6980db76490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a3911285f507951daf865d22e1dc2d7e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aaca84bf78edcf873560f46ba711426c4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aba8c5712b7a8fce9f51ee8108dcb79f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a1752a413ef2e5ee8694cbed313bd3c9b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#aa98ce75bc9f2d7c2e1cc4436470c150e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#ab520b5026f77d9694c578169268d8f2b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__cta_8cu.html#a1314b4ae40316edeea56f92f7e28410c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a94054b18dcd5508cb296f050eafaac8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#abb6922c94e0bc8151481e453e7fbd2f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a3c116db6b09393487355778e5d0ba3e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a4f79732380b8f26101bbb5a5877b0d97',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#aafa7d80ed4b830a47066853afca5adb8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#afe4fa4f0b7eca5152a57e65d0310bc97',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a65bd36be5843d363a2eb37a79abc423e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__kernel__warp_8cu.html#a0ecd7c3b11cae2bd14c04414fdf39d43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#aa277c49633d92fd3ea4687ea0f01803e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7898e52d82e5ed49f5b81644674cccf6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a54c18b3c9a1558b1f501088330c13c50',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a2f12331e96d80708241cc08cea4b1fcf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ab0ee6537f36eac8a7a5af1623b9034a1',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a7268248be04d72669a01dec69dc41c6a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#a97e63874df3289ce3294d46e2e016b05',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__cta_8cu.html#ad8cd9718877e1b127bdbe2690289a634',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, 
int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#af2cf38bc095adda2d396c87d8abcc41e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12157bf0f49e84150a01fe1696cd2517',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ae901e5d211562a991d8626c0336b0d91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a384c7aad1eb1b9b6e688ece904ad37e8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a12afc30313df2164ef2c299b47d3762f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a0c52e587496d1304d86d780ab48907bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#a96d20178b145f86f646dd54cc65a4689',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__unweighted__nobag__kernel__warp_8cu.html#ab25d29756405f0c6cd77f9374cbc4eb7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a4592aa63ba08715f737b78de44450545',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ae67b8281998dc6618d7137d6c900514d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ae4d004ac86d256e60d311e9968760ace',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< 
float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a2f2f9af58b42f9000c6afc0ede01f437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ab41d9b72247799b42c181dc59e842a2f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#a3e4660a5830af64e9d350bb97c1e3a33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#affb72e0053cfe9211f9e16b0cfadc0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__cta_8cu.html#ad6e87f8f718d28dac18c176645cc0177',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#ae3f634c3e17354623fb175e7ef20d939',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#aed47ce83acf75979b426dc241ae12149',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): 
gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a899e0ed06ca2d908cf92842a6c8145f1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#afe83f8be103b8fff8e2ef9d56910ff68',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a2aa1026f9d94c927bfdc7d12f23f8626',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float 
weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a0f22910d204e8c3b3e5ff55c9480a2e0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a670bd91b158c44cc933ee13f4083d850',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__partial__rowwise__lamb__split__weighted__kernel__warp_8cu.html#a8fcac5f4fe8809ed79e52dd0b6cd3b33',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum2_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum2_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum2_offsets, float 
learning_rate, float eps, float beta1, float beta2, float weight_decay, int64_t iter): gen_embedding_backward_partial_rowwise_lamb_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a015143a1cf9641909ef5739492836ab9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#aac73098f12c44ace7bd0c6ed29d1acb5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#adde42935e2ebd0c4cbfb5a925c603d3c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#ac9113b72a8883bfe52a840eaf6bf0bcc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a1a4b7c2d1fb4fe724a9ddcefe4a3ad96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a4d4d95ee827c360821c77e0f6a5b533c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#ad6b46d8dad6badf1a2e13000e0809359',1,'kWarpSize(const 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__cta_8cu.html#a69800c08002e6a964629da3691cfa699',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#ac0a2c283925ea0172a022b44ec4420ab',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a6ec6ef39c438b48fc5ff99850376c2e2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a68d39c92f33a5fb23bf494df10381aa7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#aa9a73b585d5c585477687c3b42859fbf',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a502058fc25fa19bb0cd2e7cfa440c82f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#a157d6cc11ad0a2f4127709df3181d056',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#af6c8d616d0e8c2d6738c38fece880943',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__kernel__warp_8cu.html#aa47b1b3531724ee008b8a88a913375d4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a0b393ddcfa07501c936c09103420a327',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac99cf2df0002f1359da1a71821a5d7a6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a16c1dd81db1a38927c5a39968b2a2047',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#adea97673f55b5d43fb1091e7cb082cae',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac80ea89a8a915ac8a1a6eaee9bd3a921',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a4b2cdd16081fdd55ef997fcba11943b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a929c9944210d7078c0bcc89ae2ad2239',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a942425624762de23778b8ea3b8da1267',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a006b3b6fd358ff41f9dad5c39f2cb330',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a81cb91d9b5c6ba53ca66e62ad21265d6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ac028b85ab4d730883ee7b170a11039da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a73f35746d0a9bed1751b964c07d2c3b6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ae5efcbb0aa7b60c29535ac9c49bbb00a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a3d28eecf8be5cfcbcd71fef6322ef6df',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a0e96ba84ab91aea304a2e6ac78eb1fa2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a96833312f1cb3bd4067a854dc1a85d9c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a1ce149ce2e815f85f42f779e853b6384',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a5e2d57ad35649098aac904f8acec4d7b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#ac8d5805872473e761a71634add6ae7b8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a873c28f5a06ab6135240b18b23aa17d5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, 
int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a86ab9a70fb4459793418ac95f6844494',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#aa1d6f03c27aea0bad56e3d38003ffda0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a0e3cae02bd4631c5b65507b91c500606',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< 
float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__cta_8cu.html#a7929e1f87db6d3d72cae3804c1aafef0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a0d5f1eb18d7aaf74fabc0d63a215062e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#ae319b79d484f9cfb10ddf935cf3dce8c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a50b88aba0d96371aba370d9894857aff',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a24d7b65f902789f50e1a0fdc3c72da0e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#aba27e610941b3c6a9520a14a567022dd',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a2922d0a81b0f1a4427fde265b05427bb',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a261934c69234b20a2f19650fa88e4cd0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__unweighted__vbe__kernel__warp_8cu.html#a60c7d08b38c83f34ba87438440f950e8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ae3f8662de26a86a0e1e1612804f49b52',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a3530c5b60b4dc3bc1fd5f0af31e32361',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ab4098b3e8ab8552ec947cbb52de77a0c',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#abc6e9b570bfaac7771adbc13408463be',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ab2bfbf20e506af2d9ee18af83b527e5e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#aaccb85f565c3b1d5a36dbf413fe05ec4',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< 
at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#a799bc0e6eb13b05b038c910b7a650bd8',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__cta_8cu.html#ac2e2b47b2c51943f4ff8fabdfb57f270',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a026a0932fcb72fcf66460486db323e9f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a99db511954f4e0ced515daf371cda8da',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a6baf7387932e58b5a570e01ea0fb2638',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a1f859731d1effb901df0012fbdf35756',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a3451cc31def5c831a428e221f4713d57',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a55aad527eb01f16edb9ec021704e4a60',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a494688d7ae0362eb0e5aacbc0ecf19c7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__kernel__warp_8cu.html#a95227f34f2ab6c04dcaadd41e1886304',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a62e5730bd70e4665352946a17b3fd18a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a91027c49f28b9c30a8fe20c5ede43b4f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#af003bb8591ecd9b6b755807f601cbde6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#af9fea1c8c674df3acb9e76cafe6518fe',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a5f06095eeec3319c0936d2a99a095054',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a30c5340455dedcd1684d0858738d7c9d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): 
gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#ab753932a15b63161c3d38c683e2d290d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const 
uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__cta_8cu.html#a874c752c07a36fb38f9476fe78a46735',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ab2c641791d87abe8e19dcee2b3726819',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a7e755382f2ce1290c7c3357ecc025b78',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a30b2c51012735e4ebe919dba89c4d8cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a365c2eb2cec39bb504cdae18934b89c3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a004e2b2b3ffa5c4c402b2f56fee16ce6',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ae3be8dde6dfb4fe3ba1a815b319a2925',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#ac5817730d59e634a76e7aafce41aaf26',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, 
const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__split__weighted__vbe__kernel__warp_8cu.html#a93d410b588239e17ac8e10d7d6e291a0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode, float max_norm): gen_embedding_backward_rowwise_adagrad_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a4172fb110abe23887cdaf0536ef2bcaa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a03929e871fb455cace7f23efc0d24583',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a468a45d6ca5a19247698337fc33f435d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a184065748160f0c7788467d39b27f5d1',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float 
tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a6a9022b14995bf97b8f204dc404e1e8d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#aea5128dbea65fac0ceb8b42749f74099',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a8f1a8a90b130ae668e3b6b7947c6c4f5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__cta_8cu.html#a2e889d0595ab0362613d58e7ff8960b7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< 
at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a6014caa4aca0c9e7b583e71900a0a48c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float 
adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a90005cd7c4e9aae8498fd1d938983179',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a384fb2660e3cb8a46cf1154d5b45bf2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a297213250dad534fbb5b3654e854f1a4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a012a4e2ee1f52bb243e5388eec3e8a5c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a0c3bd53d12b516a80478d5a9017a684b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#ab20ec4fe16b91aae91640b2dd5452ed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__kernel__warp_8cu.html#a843389bf3c054d1a20a6115d47d99cf7',1,'kWarpSize(const pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#ac79c384938b7bffef4943090b602ba5e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#aeac8eff5cada3efbb3674213a5f42bc9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t 
regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#afbd549d3981439a47fb0c3811e9eacf4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a5f6a21f619bb88465b760c5556fe6f1b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#ab03dcc766f91725239b7737cee2b194d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > 
row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a2b9bc69930f735395605b0b91203d7a5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t 
learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#a6d5664cd6fa11c72a6de5f652e0aec5d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__cta_8cu.html#af412ff33330b1349cbf7c2a33e58f9a7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#abcba604787cbdb187f05ab27324d67f7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a33f05c8d5a2149e88f0c5a0a446357c2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#ad387d04e602a3a29f7b44eaeb1edb9fa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a9cd29bb0dd406092916c5eb0605aaf0d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t 
adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a7b5add21eacc916018bb3b4e0fd96436',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a5da06cf5b2fca41ca811bae68efd4049',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#a853a5012db3ca2150440460e10d486ae',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__unweighted__nobag__kernel__warp_8cu.html#af1c9033199b40adc628848b21f60b950',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a0d5d4738a27dacbbecc699b0297a6331',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a260b636a6d13f307a286c4b24b47a1cc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t 
iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a05a2693fb5198654434e63ef4a07981e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t 
regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#ad178df90f04b6ef9c3c907c699042d8e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a04d2d84d9856aa9de1f36e1813d4c172',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a0c158805e4537d8825326a3ecddf9c9c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#ac6a60f786cbc800c9b675f386c1014ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__cta_8cu.html#a91f984a560c40dcae1abbb2391fa2fda',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): 
gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a6c5b8de0acb5391f4dc4172ce5ca094e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#afb504ea4eac563c64b42343e986a7847',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 
1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a3c2fb3ecac9e0bd458fbd1023025d5d5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, 
const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float 
weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a6dfe39e1df2bced46b2e0991e3435be9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#aed82b2485ec72bfc56b2fae686d062f0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#ac3a5c0e1adaae87917f2645e6a2afa46',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float 
max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a54694cb47dc38390f1b301aa039cb31d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__counter__split__weighted__kernel__warp_8cu.html#a9c2f7f4369735aa317a88c819b378f43',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< 
at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > prev_iter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
prev_iter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > prev_iter_offsets, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > row_counter_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > row_counter_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_counter_offsets, float eps, float learning_rate, float weight_decay, int64_t iter, int64_t counter_halflife, int64_t adjustment_iter, float adjustment_ub, int64_t learning_rate_mode, int64_t weight_decay_mode, int64_t grad_sum_decay, float max_counter, float tail_id_threshold, int64_t is_tail_id_thresh_ratio, int64_t regularization_mode, float weight_norm_coefficient, float lower_bound): gen_embedding_backward_rowwise_adagrad_with_counter_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a98043b075d1f73a69bd0b19b1a24283e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a188fb685cd69453ab94f992332f523a9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, 
float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a025f08f037ddf498278c429e09fd4d4a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, 
true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a35f6a98383bf1ed951023b1fe432ed4c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#ae079dac6052edf65f8a39b4fd9de7c70',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a745a7f66bb6899e5071ee55e90f23368',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#af9137cfc1d9e0421323b78bf589c34fc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__cta_8cu.html#a440eee4271eb5f61b204de4ec66054d2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#adf1cf7a1807aab50d346ef163c534c1d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#acde8c89a937e31cb98aa026b261cfe23',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ab8e910b2c682642ac61185d1b155c5eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ad1d9234d02b6be2ab2bdc5f4a8dc5701',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#ae5465342deb9e71765693c8929b5f475',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a377694b1c0ce71b8d0c56077a904f7d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8d2430849bd51fc5ad283d1a300cabba',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__kernel__warp_8cu.html#a8ecc1609ac62272a2c0f5a1e1cddbed5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ae8c1bfed5b951970a40f4028998d21fd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a73eba662cabf7a9761d2cf5d195206f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ab903a35e3bd981f1436d46179b87ecb9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a4074249c4919e43d534eb0904fa4693b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a77fbe03e0ff353a2ebe490cf97f0c353',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float 
eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#ad39229402610f8c9069ea8a7e1c6a0ab',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a01c0225eea92b7b0403572335b1abc61',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__cta_8cu.html#a3490c2bf081c92095011640fb03961b5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af95c4def12e4117e2d7bdc89b8fb0506',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a3f1c2f2aebc7a13ddade48d2a2f0301c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a023ead14754421961a4b473a3b1bb81c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, 
float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a5ff2b2e15a95a8d176f99a8eebddf45d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a4f470748a75cfc59c5c7a0cb577289f2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#aebd1c348edc2accec933a20abbf4ff2a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#af13fd6356fec61b096f429f666c4d50a',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< 
at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__unweighted__nobag__kernel__warp_8cu.html#a8ae09f234561f1e415ef920bbf6eba22',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a6c1e5c2776f4209766c769243bf57894',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a96ca79bd9787eab9dfe57a09f61590db',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af8966c1a682b91a466caa300f057d2cd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#aea22f0f456a89d61d1a066e7b363f59a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a22292529eb85249ba3bec7be758eebee',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a933ef9f4d58e4ecad71988cd6f5ad537',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#af15cb1c5b6cddd5d3678e3cee0a6cefe',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__cta_8cu.html#a11b62696a1fcc6753a62e4b7b78987a9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, 
at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a58cc18641eaeee8eb587cb2a3726e85b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ad7d432c589db7e87949a9d0ca5533b54',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a98fc1738f166a55809b2648796416db0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0cb98e4afaf555388869ebe3242fc7d0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac0e36eb9e678f52e0561366229ecc4bf',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): 
gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac58c7e73b10a41dc9f49d4e477b20fb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< 
at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#ac7cbe79ba3521a4bbd4c14a74fd6adff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__adagrad__with__weight__decay__split__weighted__kernel__warp_8cu.html#a0e895892d276833086475c0e7f1b7927',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t weight_decay_mode): gen_embedding_backward_rowwise_adagrad_with_weight_decay_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#affa3d280e56d69dbe39ea3bda0bcba6e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a7f2d784a0f6604d457a71d725eca24ef',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#abd72df618308d6a739f91188cc5a1e91',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< 
at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a5db669968a840fd6cd68feb612d416de',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a94192c3fad25107220bf7cf718abdfed',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a0dadc1a7dd7578c22f5d239047bf7794',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a48f4d0c7f7758b5149c9d96abb61354d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__cta_8cu.html#a173df29f55015b4b4d8c9cdda6986823',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#aed68dca4d92a97e556d3073cab88a18f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 
2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a23c47f9e7c8f8a011e9a2d3778e2a65b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a856011203b19087ab6f1eebb7a8f18dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, 
pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a69682ffbf2a367fa7e6d25edd9cf1218',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const 
int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a12192a01089a95a93f5a384e9faaa312',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a58f3e7232aae5283c177ee7305d1bede',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#a7412bb61fd123be30b935508b1839d66',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__kernel__warp_8cu.html#afd57c62802e581a57d2e9daa52b09e4b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5d3923934afd4c41777f94dd36798bb8',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#ac0c8d3772833c3ef461a44cabc9cda70',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< 
uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a5b2d60d4092d3af5e898446d1ffc3282',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#aed55f18c1fecec6d6de78577918449a0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool 
use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a240624068305d411db3cdece269f6a2f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a6854bab8c0d96882e4f9f980880531bc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): 
gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#a9ab47f5d78d1d005e9f8784e812589b0',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool 
use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__cta_8cu.html#af7d6e1a2bc0d32d0273140358b977b4e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#ad94f32bbc65499df3140ee3a12f12dbc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a84246813bd816d0adfa4751b327dbfa6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a90ffe039f52ddd5cf5e1108e7116b612',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#af129bcfb9d742a9a531ee4c3324bdb9a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#add33ba4596f143bb11a12a3508c0fb32',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#a7b80756a8fab65071212121bf535f2d3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#aef2fbd1a40bded32e9118172ea588823',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__unweighted__nobag__kernel__warp_8cu.html#aabf9c6be454bf78678fc82ba87ed2b56',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a2e2219247d875dadcb571833d8282ca4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, 
float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a6940aede0efad4a0cca521cfdcec433b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ac7273b842f26b655461dfe827e4bc669',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#aaea9d9291155e312439e673a39970cc3',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a45a1b05c68acf892f30f7ee837bb5aac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a526a3a91d4d22f8f4b8b25d52ea7539c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#ae9fc0a17625be30b2c3e94857e45e660',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__cta_8cu.html#a7d36afee5962e7c2e645ed580a9293d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#af6ea5271fc0e7434bb952837a4ec992c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#adf897a86ff3ef489f638c5d6cd604fb7',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a6562c95418573901d3dd3e933fdb1798',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > 
lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a2da0fed4926ad614ee0554b8f818854d',1,'kWarpSize(const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, 
int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a29dc67ef45e2c108c079066771ca4b15',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true 
>, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a908dadafc7b1c847ac07f402090b784c',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool 
stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#a861cd39a27db6459d3d308938724a605',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< float, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__rowwise__weighted__adagrad__split__weighted__kernel__warp_8cu.html#aceb4cd33e669bb98a7d191fb45221a80',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_dev, pta::PackedTensorAccessor64< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > momentum1_uvm, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > momentum1_placements, pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > momentum1_offsets, float eps, float learning_rate, float weight_decay, int64_t iter): gen_embedding_backward_rowwise_weighted_adagrad_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a9a42f11861e28ce77032f8047e83ea11',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a8bdc52848ae2ccea30492b4414adb034',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a0c819af30fed201203e68ceda2eca173',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a0720d53a4c9644a99b5cbe9e245dc3e9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#af0d4736eed64c8bbf3a20923bb9c29f9',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#ad54cfe3bdecfc6441753596772402ca3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, 
at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a2503c0d4c5e56ba15bfb7df317dda0ac',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, 
const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__cta_8cu.html#a6d8a94d5bd394aab6b93267e3f0f2673',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a5917f6544b279539f51ba07a7d4d5ca5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a160f8f69b25890024d8d91dd87bbba82',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a64537991cc98a52cb2bd884dbcc7bebc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 
1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#ae0574dfcf396c58dc8863401720dacb5',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a6bb05de78f7804f75e027524d191e5da',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a64b45cd53c38d53cedf6f4d66afd11f5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a9d44bcd45f9e02788aecbf226dbeb110',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t 
max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__kernel__warp_8cu.html#a1d850e642c167b5e60a73c88a47f7f16',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#adae5d509289eae4626e7cc6eda18efbb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aebc5dd156def696b75e9590fdd7e44f4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a4e775aca46c2cf5dfe37c97a0c320eef',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#abff5c0eb0f3d6a4dda6f6a5f51450dad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a7cfbc77648395dd0be255b6c2a04797e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#aedc7a4a2ea94e6294c49780531ce8562',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a5d38c3b8f12784860c0d0219684a22f2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__cta_8cu.html#a5ba8347d410dea8ce7952d7e5674a053',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ad491955b35ee3fb84ecdbc35426aa9c6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ada25cfa4c47b6ce54c00b842e414e5cb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#af4e0c8de103d5b95b3930d72723dedde',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aa7f8ec16263de0ec18ba44144f3f6409',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#ab49ef540e21a06c9366b7a4b1b643855',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#aab891f89faefe34faf30508569d63250',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a9a34cc24631cc7850723f21d44ac9bb3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__nobag__kernel__warp_8cu.html#a0a8f9dbd0e03e001dc43109c9b58edca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, float learning_rate): gen_embedding_backward_sgd_split_unweighted_nobag_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a8f1dc526305df11d57d5151eb78864fd',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#acb04bf74aa1979914c837887050094ee',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a79789b33f1e3e7e2f3908b939ae1e44c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a3b929350b08473bf7001fb6e8d38f64d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, 
at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a027461b35f0b0e8c2245ef80575fe911',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, 
pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a81afc9990a7d79a97ddf8ee0bb84f62b',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a98e60157f32325eabb7ce026f700e32f',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__cta_8cu.html#a6050e98a82b09a3401ba1bcefb21abdc',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#acb1714d604a523f5860b4c87e669c715',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ab83eacf00da6299593ee678a4b1e4615',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a4682eb9fbf137eb4577349e11559ecd0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ab5dafd4069aae36629ecb34e3975ea6f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#aa223add1301373e53e5b0ac08530a54e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const 
bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a46cd5ec5d4f141fe5dcce4a8b22a1aff',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#a8aee7d2ca70c048a87381106420a93b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__unweighted__vbe__kernel__warp_8cu.html#ac111217914f0bc07a2ec19cf00f46b52',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState 
stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_unweighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a1a5b4fc1cd662532df45be95fae00e34',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, 
at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ae8e719bacb730ff6f6f24b072264fece',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#adc76a3911b3c75253490fa732520c59d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#a0246985d6062109ceb9d0a316e236be9',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ad84b0786ecfc63b8b6b3a1383dbfe719',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#aa5cf42df68862104a475751de18f2d7d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#ad32698d0cc220a69f7ffe6cf58fe5389',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): 
gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__cta_8cu.html#af6476f9b0a8e869bb5f1fbc1c39714ca',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, 
at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ac2742fc3885cf36bb8ac4d7d4c24587f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a7b4db1681ed1be00464c3420ff441efa',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): 
gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a050fc99733adcb785414bd0c401d02e0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#ae79041a9602287ab549b549edc4f5040',1,'kWarpSize(const 
pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a981ec80c80a0ca3713a250bca8dcfd2d',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 
1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a8e5539e49116fc0d95e74b70fff7eb96',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits 
> weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#af896552004ed24a4f6289bd6321b95b3',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__kernel__warp_8cu.html#a153506803483f7484c6fc69a32b06b26',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#aae8702725dfe41086ad78bb86764b34c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#ad07aac3191ff79c34b89afd9b89305ad',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#adf78b0255c91deececdee2d30eb7f2ae',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > 
dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const 
bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a3860c9b9fc99bf6f1e19426e6d95f473',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, 
const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a57427b04d21bb9e1302a85d709f94e02',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#aaa3e935211a7fd38509a279705c5e5d7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< 
int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a7ee0552285c492916b1c76b31630d3c2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< float, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__cta_8cu.html#a4ef721cf4ccbf7faeaad926427c279f7',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > num_long_run_ids, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > long_run_id_to_really_long_run_ids, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 2, at::RestrictPtrTraits > temp_grad_accum, pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > grad_accum_counter, const int32_t max_segment_length_per_cta, const bool use_deterministic_algorithms, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_cta.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a0ab8a7e2535ae5a3f056f529bcb1071a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a573d877b87f31127000da9bc22ad74f2',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#afcba725b1740e61675c5148dd9523082',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a027faf7fa459ca567059607e155a1546',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#ac7a033e21d13e0bd2a2268a4086c9770',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#ab065602b705ef3209e6d4de9f8dc0bb2',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a4342e36e81769a5d3992a7c557cb4e0e',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__backward__sgd__split__weighted__vbe__kernel__warp_8cu.html#a88f0e0bc690728b1e246b8248e9ec6e5',1,'kWarpSize(const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > grad_output, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > hash_size_cumsum, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > sorted_linear_indices_run, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_cumulative_run_lengths, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_infos, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_lxu_cache_locations, const bool use_uniq_cache_locations, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > table_unique_indices_offsets, const pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > sorted_indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > sorted_linear_indices_num_runs, int32_t max_segment_length_per_warp, bool stochastic_rounding, at::PhiloxCudaState stochastic_rounding_philox_args, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > B_offsets, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > row_output_offsets, const int32_t info_B_num_bits, const uint32_t info_B_mask, float learning_rate): gen_embedding_backward_sgd_split_weighted_vbe_kernel_warp.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#afb628f9293807019a85f62216802fb27',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a4cfe4909493e5c6c0b3272b407756da5',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a54d7f4614b27377a702368d9be00913a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a8a9dfc0b7289bfa8ee20c3a9c89a1382',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#a5e36f01e2e5309c8de784ae9cf8b6995',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__kernel_8cu.html#ae45afbdb3f525626eeb8ec0c6be41f24',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, 
at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a067da40e6e91e38bb46e13bab2169087',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a9ddc1dda2eb92f1166514ddb7da1bbc4',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a031019a7e2638f18e08649bd6c279449',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a63685bd7126cdab9a0d8e4046c3e150c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#a623f10d789c87a085d7c83199ac22f55',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__unweighted__nobag__kernel_8cu.html#ab30ac9e21532c639d357440a7edfc7eb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a3dfcd6c505c277727fdc5a5efd1f21d2',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ab0581905b4247bac67216a78dfb722c0',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aa21811af7885f72fe15a805872bd5a22',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a14fb66cd776fba62200b634101140f86',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#aff669225134b913ac286c1517e039727',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a87b17b201934f903fd2f193ac6a71629',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#abdf19a2e8c33cb0148de770a95bd662c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a469aee03c0d8fde04842d8747ef880bb',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, 
FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a3048e1d82f672e144f218a9bc1f02bba',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#ad6d957e4c772be151a4b6c0937b71e2c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a4199338fdc51c5f831d168e63d783674',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__dense__weighted__kernel_8cu.html#a83fe0c13753b93fbe0b623e8bc652721',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_dense_weighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#aea91359dc803899d522a74120b6d587c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, 
const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a2cae6fdef6f90d98293e7e6f2eda2138',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a254bf4fa577be3f3edb7bf1dc9339295',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > 
uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#ad9e0f41f3ef8ca4cd788578980ccd083',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0277f9514d8b9668290fe078c5ad155b',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, 
at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5da4584d7767e8c488e4e29780c3aadc',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a5ea57f49a9d1efbc601b256ec5d13107',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a93ac400107836c0de2730e3a54959ed6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0e17c23a544e4b4ebaf07d215ece084f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a51028935eee6951c1298eb5d7092d650',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, 
int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#adfec29a5a30407f3b60408b80419baac',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__kernel_8cu.html#a0566524005bbfc2c27eac06fe4ebe955',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a568c659233485f309357ee134d1b748f',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ae5b51047bc4a0305b636290e7ddb278a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const 
pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a3262dbb14f77bf739b020bdf79075384',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a0f68f3297710141bc57e677b3d0587ce',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a4876b4f94d323f090efef96432fc27a0',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a1c7d35447c029aba8ddce8e9532a8d82',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#ad69828fa35cf5312392a5791a435ac3f',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a0209c0fd938024beedd0716523eaa090',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > 
lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a9461df0509fec5e584eaa309acb4e0ea',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a95cb4e2cdf49f5f5ba2f9a2acf3ff32d',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, 
at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#af585b19bb7928041ac8b70d56c7d6f1a',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__nobag__kernel_8cu.html#a8c3130a42a235a75553eaf160ac657dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, int64_t D, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_nobag_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a8bd2c5adf9e33805340e4717cd9f0617',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a3d0b2d3bd9c920851a41c71817e28378',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 
1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a436f00b93c571aa3159b822122e4e781',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a17466673ca73e70a4887999d2955aaf0',1,'kWarpSize(const 
pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#aea632259492fcd4ba0011382bee2beea',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t 
info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a4458fec1221cc0c7df2c1ef8bef422db',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#aaf16de5ee78d9de99a703cdbe61255c0',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< 
float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a8835d1cabbeed24c96e827473542eea4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a931774f9fe3e608ee4b30ec8e200049b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#a2463ea08e2eade6932bdc3b08dbf3f4b',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#abdcfee895dc0dbe60f3899820e3faef6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__unweighted__vbe__kernel_8cu.html#afdce4c5ff535f039b96169a5441d66f6',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_unweighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#aefd9621d81effbb756e78929daae8517',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< 
int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a91968527cd3a341bbc8777ae41190d41',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ac9f0e82189d5fd39e1aed1f89eb7336d',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a61ff627d873a281ecf852f217e944c4c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#afe7f0771d29a6a9ffd897e23dd341d7c',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ad96d48c6eacdc0589531c48472f370dd',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits 
> offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a82d9c13b59a58a367c962ccdaa95bc01',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#ab46e47b9451a78d43c7c23cf897e9445',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, 
at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#af359f9c87918957f14c927e52e0d719c',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#af47656d04bdce098caf47b331b74fe2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const 
pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a43cd667ed17b8606af1dd1f5027311a4',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, 
pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__kernel_8cu.html#a99087a69215e3ecfff828e64866fb490',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, FixedDivisor fd_B, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ad6141ba5c93e5aea872230ecd4a0d878',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a7bb186f4330ddb51696533419c414b5a',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): 
gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#acf2c859f1eecda3ddf9ec37754afe3e4',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a7745e0ade9aa98a7050c6a76c59e88bc',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a51039fcb60604faf673a12fc9962de52',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 
1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a39c7a917fc74982eb89a2a6770d0be92',1,'kWarpSize(const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a094950f659c8dd934ea88348ed79dd2e',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, 
const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#af4df56b4d05360a4cc547377c34a79dc',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, 
at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a69613a0e40ad1ddb76bcf494c6eba437',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< float, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a039318e8b0ec66d135fcd3f9b16a4228',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const 
pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< uint8_t, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#ab89613a21534acb8fe6c89a570467067',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > 
offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu'],['../gen__embedding__forward__split__weighted__vbe__kernel_8cu.html#a3947e811d4918cac9bd3e70fcce80126',1,'kWarpSize(const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > dev_weights, const pta::PackedTensorAccessor64< float, 1, at::RestrictPtrTraits > uvm_weights, const pta::PackedTensorAccessor64< at::Half, 2, at::RestrictPtrTraits > lxu_cache_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > weights_placements, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > weights_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > D_offsets, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > output_offsets, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > b_t_map, const int32_t info_B_num_bits, const uint32_t info_B_mask, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices, const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > offsets, int64_t pooling_mode, pta::PackedTensorAccessor32< at::acc_type< at::Half, true >, 1, at::RestrictPtrTraits > indice_weights, const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > lxu_cache_locations, pta::PackedTensorAccessor64< float, 2, at::RestrictPtrTraits > output): gen_embedding_forward_split_weighted_vbe_kernel.cu']]] +]; diff --git a/search/functions_c.js b/search/functions_c.js new file mode 100644 index 000000000..eb5b6db5a --- /dev/null +++ b/search/functions_c.js @@ -0,0 +1,29 @@ +var searchData= +[ + 
['lengths_5frange_0',['lengths_range',['../namespacefbgemm__gpu.html#a9599d315f833a6d562ee1d25d4ee5923',1,'fbgemm_gpu']]], + ['lengths_5frange_5fcuda_1',['lengths_range_cuda',['../namespacefbgemm__gpu.html#ace0a963a484e5501c50533122cdecc3c',1,'fbgemm_gpu']]], + ['lengths_5frange_5fout_2',['lengths_range_out',['../namespacefbgemm__gpu.html#a19280a435704ff4093b148460c37bc84',1,'fbgemm_gpu']]], + ['lfu_5fcache_5ffind_5funcached_5fcuda_3',['lfu_cache_find_uncached_cuda',['../namespacefbgemm__gpu.html#a9e8721a4003045038e10d3a4c8258c96',1,'fbgemm_gpu']]], + ['lfu_5fcache_5fpopulate_5fbyte_5fcpu_4',['lfu_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#a45bb3081a2688f09448ffda6bc5d5f2e',1,'fbgemm_gpu']]], + ['lfu_5fcache_5fpopulate_5fbyte_5fcuda_5',['lfu_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu'],['../group__table-batched-embed-cuda.html#ga2b76a0cf452f00e77696d896d7a402f3',1,'lfu_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, int64_t row_alignment): lfu_cache_populate_byte.cu']]], + ['lfu_5fcache_5fpopulate_5fcuda_6',['lfu_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor 
weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu'],['../group__table-batched-embed-cuda.html#ga854b8951ef7e78da812be97041d7d2dc',1,'lfu_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, Tensor lfu_state, bool stochastic_rounding): lfu_cache_populate.cu']]], + ['lfu_5fupdate_5fcounts_5fcuda_7',['lfu_update_counts_cuda',['../namespacefbgemm__gpu.html#aca510adc64caa635df004e9b419bbb1b',1,'fbgemm_gpu']]], + ['linearize_5fcache_5findices_5fcpu_8',['linearize_cache_indices_cpu',['../namespacefbgemm__gpu.html#a6eaeebeb996c343db6d076fce7952133',1,'fbgemm_gpu']]], + ['linearize_5fcache_5findices_5fcuda_9',['linearize_cache_indices_cuda',['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets): linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga23e7545e51b296d9b72c86f37c360dc6',1,'linearize_cache_indices_cuda(Tensor cache_hash_size_cumsum, Tensor indices, Tensor offsets): linearize_cache_indices.cu']]], + ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcpu_10',['linearize_cache_indices_from_row_idx_cpu',['../namespacefbgemm__gpu.html#a9c7ab59a89fd36f5c07b9c86bdc891c8',1,'fbgemm_gpu']]], + ['linearize_5fcache_5findices_5ffrom_5frow_5fidx_5fcuda_11',['linearize_cache_indices_from_row_idx_cuda',['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices): 
linearize_cache_indices.cu'],['../group__table-batched-embed-cuda.html#ga6eed85d3e9b5dbef8a753bb81c2d6e05',1,'linearize_cache_indices_from_row_idx_cuda(Tensor cache_hash_size_cumsum, Tensor update_table_indices, Tensor update_row_indices): linearize_cache_indices.cu']]], + ['load_12',['load',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< float >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< float >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< float >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< float >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< float >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< at::Half >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< at::Half >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< at::Half >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< at::Half >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< at::Half >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const at::BFloat16 
*p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const double *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a766fc3e4e85cfdbab24e0ba390db0d55',1,'fbgemm_gpu::Vec4T< double >::load(const at::Half *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#ad96458a9ac1be72cc29c0963bf9fcb5b',1,'fbgemm_gpu::Vec4T< double >::load(const at::BFloat16 *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a858ccf060c9cb3af78e60a04c7104ff5',1,'fbgemm_gpu::Vec4T< double >::load(const float *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#a9f38e7787afcaf85c132d3b7e47ab70f',1,'fbgemm_gpu::Vec4T< double >::load(const uint8_t *p)'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#afaf3bc4be251007b23417bf53b8223db',1,'fbgemm_gpu::Vec4T< double >::load(const double *p)'],['../structfbgemm__gpu_1_1_weight_row.html#a889b0ea41fd15897021ab06b2d62bf29',1,'fbgemm_gpu::WeightRow::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html#a2b08d5d5c065fbbe307dfa9237f58dc7',1,'fbgemm_gpu::Vec4StepT< STEP, float >::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html#ad300c1cf97abb3337915a7b9616b371e',1,'fbgemm_gpu::Vec4StepT< STEP, at::Half >::load()'],['../structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html#a86807843e011cecc10c8f37761f5fc20',1,'fbgemm_gpu::Vec4StepT< STEP, uint8_t >::load()']]], + 
['load_5fqparams_13',['load_qparams',['../structfbgemm__gpu_1_1_weight_row.html#a5f3a7bac9f71533d09bb41e67708ffc2',1,'fbgemm_gpu::WeightRow']]], + ['load_5fqparams_5ffrom_5frow_14',['load_qparams_from_row',['../namespacefbgemm__gpu.html#a003948b9ad61509936564075f2cead23',1,'fbgemm_gpu']]], + ['lookup_5fbatched_5funary_5fembedding_5ffunction_15',['lookup_batched_unary_embedding_function',['../namespacefbgemm__gpu.html#a74ffde7bbe921424bef364880c5d57ea',1,'fbgemm_gpu']]], + ['lru_5fcache_5ffind_5funcached_5fcuda_16',['lru_cache_find_uncached_cuda',['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter): lru_cache_find.cu'],['../group__table-batched-embed-cuda.html#ga76807cfe283a9e8f258818f3f439e6cd',1,'lru_cache_find_uncached_cuda(Tensor unique_indices, Tensor unique_indices_length, int64_t max_indices, Tensor lxu_cache_state, int64_t time_stamp, Tensor lru_state, bool gather_cache_stats, Tensor uvm_cache_stats, bool lock_cache_line, Tensor lxu_cache_locking_counter): lru_cache_find.cu']]], + ['lru_5fcache_5fpopulate_5fbyte_5fcpu_17',['lru_cache_populate_byte_cpu',['../namespacefbgemm__gpu.html#a8d6ac45089730a607c2a46a265ac8b7b',1,'fbgemm_gpu']]], + ['lru_5fcache_5fpopulate_5fbyte_5fcuda_18',['lru_cache_populate_byte_cuda',['../group__table-batched-embed-cuda.html#ga5958e4cecc978d415714a3dd691fbc11',1,'lru_cache_populate_byte_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t 
row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate__byte_8cu.html#a53a2183d85282ab5726018767388efe8',1,'lru_cache_populate_byte_cuda(Tensor weights, Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor weights_tys, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, int64_t time_stamp, Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats): lru_cache_populate_byte.cu']]], + ['lru_5fcache_5fpopulate_5fcuda_19',['lru_cache_populate_cuda',['../group__table-batched-embed-cuda.html#ga00d12767ad238d73598bf7dc4d1afa06',1,'lru_cache_populate_cuda(at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< at::Tensor > lxu_cache_locking_counter): split_embeddings_cache_cuda.cuh'],['../lru__cache__populate_8cu.html#ab841aec9d8660e547e492948a2ee9921',1,'lru_cache_populate_cuda(Tensor weights, Tensor cache_hash_size_cumsum, const int64_t total_cache_hash_size, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, Tensor linear_cache_indices, Tensor lxu_cache_state, Tensor lxu_cache_weights, const int64_t time_stamp, Tensor lru_state, const bool stochastic_rounding, bool gather_cache_stats, c10::optional< Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< Tensor > lxu_cache_locking_counter): lru_cache_populate.cu']]], + 
['lt_20',['lt',['../structfbgemm__gpu_1_1_comparator.html#aff9ffad7ca52493418c969769327b704',1,'fbgemm_gpu::Comparator']]], + ['lxu_5fcache_5fflush_5fcuda_21',['lxu_cache_flush_cuda',['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#ga2b055aeb5bf2d99bfb4351271764cab1',1,'lxu_cache_flush_cuda(Tensor uvm_weights, Tensor cache_hash_size_cumsum, Tensor cache_index_table_map, Tensor weights_offsets, Tensor D_offsets, int64_t total_D, Tensor lxu_cache_state, Tensor lxu_cache_weights, bool stochastic_rounding): lxu_cache.cu']]], + ['lxu_5fcache_5flocations_5fupdate_5fcuda_22',['lxu_cache_locations_update_cuda',['../group__table-batched-embed-cuda.html#ga65cba33a439fb1ed50fe2e80dc22b603',1,'lxu_cache_locations_update_cuda(at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional< at::Tensor > num_uniq_cache_indices): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#ac602137fddc0c895b176d959fa3fa8db',1,'lxu_cache_locations_update_cuda(Tensor lxu_cache_locations, Tensor lxu_cache_locations_new, c10::optional< Tensor > num_uniq_cache_indices): lxu_cache.cu']]], + ['lxu_5fcache_5flocking_5fcounter_5fdecrement_5fcuda_23',['lxu_cache_locking_counter_decrement_cuda',['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu'],['../group__table-batched-embed-cuda.html#gaeaf8f13290f0fe389fefa3fc2a944311',1,'lxu_cache_locking_counter_decrement_cuda(at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations): lxu_cache.cu']]], + 
['lxu_5fcache_5flookup_5fcpu_24',['lxu_cache_lookup_cpu',['../namespacefbgemm__gpu.html#ab26f1a83ce47d5510deed9bc9e9d6d9a',1,'fbgemm_gpu']]], + ['lxu_5fcache_5flookup_5fcuda_25',['lxu_cache_lookup_cuda',['../group__table-batched-embed-cuda.html#ga124b70b0fede88f508e59111ce6d765f',1,'lxu_cache_lookup_cuda(at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, c10::optional< at::Tensor > num_uniq_cache_indices, c10::optional< at::Tensor > lxu_cache_locations_output): split_embeddings_cache_cuda.cuh'],['../lxu__cache_8cu.html#a083f4fd1219188cc40036595fa6921ab',1,'lxu_cache_lookup_cuda(const Tensor linear_cache_indices, const Tensor lxu_cache_state, const int64_t invalid_index, const bool gather_cache_stats, const c10::optional< Tensor > uvm_cache_stats, const c10::optional< Tensor > num_uniq_cache_indices, const c10::optional< Tensor > lxu_cache_locations_output): lxu_cache.cu']]] +]; diff --git a/search/functions_d.js b/search/functions_d.js new file mode 100644 index 000000000..6c51bf9fd --- /dev/null +++ b/search/functions_d.js @@ -0,0 +1,20 @@ +var searchData= +[ + ['main_0',['main',['../_c_make_c_compiler_id_8c.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): CMakeCCompilerId.c'],['../_c_make_c_x_x_compiler_id_8cpp.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): CMakeCXXCompilerId.cpp'],['../verify__fp16__stochastic__benchmark_8cu.html#a0ddf1224851353fc92bfbff6f499fa97',1,'main(int argc, char *argv[]): verify_fp16_stochastic_benchmark.cu']]], + ['make_5fpacked_5ftensor_5faccessor32_1',['make_packed_tensor_accessor32',['../fbgemm__tensor__accessor_8h.html#ae5c092ed88e41832d415d06d837889b3',1,'fbgemm_tensor_accessor.h']]], + ['make_5fpacked_5ftensor_5faccessor64_2',['make_packed_tensor_accessor64',['../fbgemm__tensor__accessor_8h.html#add453d9931017b7ca11b84095566ae26',1,'fbgemm_tensor_accessor.h']]], + 
['make_5fzero_5ffloat2_3',['make_zero_float2',['../namespacefbgemm__gpu.html#a25e94d75c07b4c2bc5427fe771f2d60d',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat4_4',['make_zero_float4',['../namespacefbgemm__gpu.html#afca9b335bed360fc1ec3e239183a792f',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat8_5',['make_zero_float8',['../namespacefbgemm__gpu.html#a66822cc23f92dbb8c18c596511b2a917',1,'fbgemm_gpu']]], + ['make_5fzero_5ffloat_5f16_6',['make_zero_float_16',['../namespacefbgemm__gpu.html#a7dcc205dbf44fb2e80d62bf47eb6c4c4',1,'fbgemm_gpu']]], + ['masked_5findex_5fput_5fbyte_5fcuda_7',['masked_index_put_byte_cuda',['../ssd__split__table__batched__embeddings_8cpp.html#ac6846069e59fcf7c6fad94b1321b0dd0',1,'ssd_split_table_batched_embeddings.cpp']]], + ['masked_5findex_5fput_5fcuda_8',['masked_index_put_cuda',['../ssd__split__embeddings__cache__cuda_8cu.html#a8a561f5585f09252076650c0d34457d7',1,'masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count): ssd_split_embeddings_cache_cuda.cu'],['../ssd__split__table__batched__embeddings_8cpp.html#a8a561f5585f09252076650c0d34457d7',1,'masked_index_put_cuda(Tensor self, Tensor indices, Tensor values, Tensor count): ssd_split_embeddings_cache_cuda.cu']]], + ['masked_5fselect_5fjagged_5f1d_9',['masked_select_jagged_1d',['../namespacefbgemm__gpu.html#a0223abaee318471a5e42318a1b7056b6',1,'fbgemm_gpu']]], + ['max_10',['max',['../namespacefbgemm__gpu.html#a5f0a51933b0e3b1a96d8806d702ff82e',1,'fbgemm_gpu']]], + ['merge_5fpooled_5fembeddings_11',['merge_pooled_embeddings',['../namespacefbgemm__gpu.html#a25ca3ce57c9101b878431d46cc049b50',1,'fbgemm_gpu']]], + ['merge_5fpooled_5fembeddings_5fcpu_12',['merge_pooled_embeddings_cpu',['../namespacefbgemm__gpu.html#aad2aea0289bc3c5d135846ee32e0638c',1,'fbgemm_gpu']]], + ['min_13',['min',['../namespacefbgemm__gpu.html#a5b62c5028106dcf10b450a8f178338ad',1,'fbgemm_gpu']]], + 
['mod_14',['Mod',['../classfbgemm__gpu_1_1_fixed_divisor.html#a604d46db75c43e0cd210e5b2ab2bc7e6',1,'fbgemm_gpu::FixedDivisor']]], + ['mul_15',['mul',['../structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 1, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 2, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::FP >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 4, PrimitiveType::INT >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 8, PrimitiveType::INT >::mul()'],['../structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html#a16f1fffe0b09a20da784cd647d11bf28',1,'fbgemm_gpu::VecNT< 16, PrimitiveType::INT >::mul()']]], + ['mul_5f_16',['mul_',['../structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< float >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< at::Half >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< at::BFloat16 >::mul_()'],['../structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html#aacd3ba9c4686c00921e3e2dcc754b000',1,'fbgemm_gpu::Vec4T< double >::mul_()']]] +]; diff --git a/search/functions_e.js b/search/functions_e.js new file mode 100644 index 000000000..c22214cda --- /dev/null +++ b/search/functions_e.js @@ -0,0 +1,11 @@ +var searchData= +[ + 
['native_5fempty_5flike_0',['native_empty_like',['../namespacefbgemm__gpu.html#a2f18d44e708cafd185e02defd95fb774',1,'fbgemm_gpu']]], + ['nearest_5frounding_5fvector_1',['nearest_rounding_vector',['../namespacefbgemm__gpu.html#a94744dd15c8d4ffa9c5cf581e499f1ca',1,'fbgemm_gpu::nearest_rounding_vector(dst_t *output, const Vec4T< src_t > &value, const float2)'],['../namespacefbgemm__gpu.html#aa56064f3d743f7535d59a1baca06dc1f',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< float > &value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aefcbaad4af03b4a72b15ca0ca40bc50f',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< at::Half > &value, const float2 qparams)'],['../namespacefbgemm__gpu.html#aa8fa436e2338f97218eff8a48c94d8a4',1,'fbgemm_gpu::nearest_rounding_vector(uint8_t *output, const Vec4T< double > &value, const float2 qparams)']]], + ['new_5fhost_5fmapped_5ftensor_2',['new_host_mapped_tensor',['../group__cumem-utils.html#ga5663643a8ac5de83063d0ff51bb9af17',1,'fbgemm_gpu']]], + ['new_5fmanaged_5ftensor_3',['new_managed_tensor',['../group__cumem-utils.html#gab708b23762a11187eb6a32a36f0e34a3',1,'fbgemm_gpu']]], + ['new_5fmanaged_5ftensor_5fmeta_4',['new_managed_tensor_meta',['../group__cumem-utils.html#ga5351c6ec3de203476cf09df330455d91',1,'fbgemm_gpu']]], + ['new_5funified_5ftensor_5',['new_unified_tensor',['../group__cumem-utils.html#ga6f8847537ea9ed13fc7e2e378bc79b1f',1,'fbgemm_gpu']]], + ['new_5funified_5ftensor_5fcpu_6',['new_unified_tensor_cpu',['../namespacefbgemm__gpu.html#aad6847fe2dc2433889aeb2dddf14f496',1,'fbgemm_gpu']]], + ['new_5fvanilla_5fmanaged_5ftensor_7',['new_vanilla_managed_tensor',['../group__cumem-utils.html#gad5e0d2307667c3db5e73f0c0eec15df5',1,'fbgemm_gpu']]] +]; diff --git a/search/functions_f.js b/search/functions_f.js new file mode 100644 index 000000000..1d555ec06 --- /dev/null +++ b/search/functions_f.js @@ -0,0 +1,7 @@ +var searchData= +[ + 
['offset_5ftbe_5finput_5fcombine_5fwith_5flength_5fargs_0',['offset_tbe_input_combine_with_length_args',['../namespacefbgemm__gpu.html#ab6871043c7881b5434de1e8eea491c80',1,'fbgemm_gpu']]], + ['offsets_5frange_5fcpu_1',['offsets_range_cpu',['../namespacefbgemm__gpu.html#a5aff23a0a3b0bc872ba44a0045b6e350',1,'fbgemm_gpu']]], + ['offsets_5frange_5fcuda_2',['offsets_range_cuda',['../namespacefbgemm__gpu.html#a3d88da2f7a769565c9ebdc070467eabe',1,'fbgemm_gpu']]], + ['operator_5b_5d_3',['operator[]',['../classfbgemm__gpu_1_1_tensor_accessor.html#a72a3b6251f6388b00f3edcd8d3311600',1,'fbgemm_gpu::TensorAccessor::operator[](index_t i)'],['../classfbgemm__gpu_1_1_tensor_accessor.html#a16735630a1b17005797473122c151321',1,'fbgemm_gpu::TensorAccessor::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a00a4aa208155f5c8a633eddc32351081',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i)'],['../classfbgemm__gpu_1_1_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a3b81b97c0e920adcd47b7f6a5b0af0cf',1,'fbgemm_gpu::TensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#ab6e8f8fe313b1de35e94636bdd4e34dd',1,'fbgemm_gpu::GenericPackedTensorAccessor::operator[](index_t i)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor.html#a6933e03eff2b2428f9eb67e597a520c1',1,'fbgemm_gpu::GenericPackedTensorAccessor::operator[](index_t i) const'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a3593eea2d954fec0db1139e509206816',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, index_t >::operator[](index_t i)'],['../classfbgemm__gpu_1_1_generic_packed_tensor_accessor_3_01_t_00_011_00_01_ptr_traits_00_01index__t_01_4.html#a07dee357cdcdf158224410aaf987e7d3',1,'fbgemm_gpu::GenericPackedTensorAccessor< T, 1, PtrTraits, 
index_t >::operator[](index_t i) const']]] +]; diff --git a/search/groups_0.js b/search/groups_0.js index 03e08f09d..3db46a5ee 100644 --- a/search/groups_0.js +++ b/search/groups_0.js @@ -1,10 +1,9 @@ var searchData= [ ['combine_20input_20operators_0',['Combine Input Operators',['../group__input-combine.html',1,'']]], - ['cpu_20operators_1',['cpu operators',['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], - ['cpu_20permutation_20operators_2',['CPU Permutation Operators',['../group__permute-pooled-embs-cpu.html',1,'']]], - ['cuda_3',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]], - ['cuda_20memorty_20operators_4',['CUDA Memorty Operators',['../group__cumem-utils.html',1,'']]], - ['cuda_20operators_5',['cuda operators',['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], - ['cuda_20permutation_20operators_6',['CUDA Permutation Operators',['../group__permute-pooled-embs-gpu.html',1,'']]] + ['cpu_1',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['cpu_20operators_2',['CPU Operators',['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], + ['cuda_3',['CUDA',['../group__permute-pooled-embs-gpu.html',1,'Permute Pooled Embeddings Operators 
(CUDA)'],['../group__quantize-ops-cuda.html',1,'Quantization Operators (CUDA)']]], + ['cuda_20memory_20operators_4',['CUDA Memory Operators',['../group__cumem-utils.html',1,'']]], + ['cuda_20operators_5',['CUDA Operators',['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]] ]; diff --git a/search/groups_1.js b/search/groups_1.js index a3a30195c..e7441b2cf 100644 --- a/search/groups_1.js +++ b/search/groups_1.js @@ -1,5 +1,5 @@ var searchData= [ - ['data_20cpu_20operators_0',['data cpu operators',['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], + ['data_20cpu_20operators_0',['Data CPU Operators',['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators']]], ['data_20cuda_20operators_1',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]] ]; diff --git a/search/groups_2.js b/search/groups_2.js index 6e3a28acb..cfc6b1a56 100644 --- a/search/groups_2.js +++ b/search/groups_2.js @@ -1,5 +1,7 @@ var searchData= [ ['embedding_20cpu_20operators_0',['Embedding CPU Operators',['../group__embedding-cpu.html',1,'']]], - ['embedding_20cuda_20operators_1',['Embedding CUDA Operators',['../group__embedding-cuda.html',1,'']]] + ['embedding_20cuda_20operators_1',['Embedding CUDA Operators',['../group__embedding-cuda.html',1,'']]], + ['embeddings_20operators_20cpu_2',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['embeddings_20operators_20cuda_3',['Permute Pooled Embeddings Operators (CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]] ]; diff --git 
a/search/groups_3.js b/search/groups_3.js index 7e14db79a..f46d26665 100644 --- a/search/groups_3.js +++ b/search/groups_3.js @@ -1,4 +1,4 @@ var searchData= [ - ['for_20cuda_0',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]] + ['input_20operators_0',['Combine Input Operators',['../group__input-combine.html',1,'']]] ]; diff --git a/search/groups_4.js b/search/groups_4.js index f46d26665..1a565975f 100644 --- a/search/groups_4.js +++ b/search/groups_4.js @@ -1,4 +1,5 @@ var searchData= [ - ['input_20operators_0',['Combine Input Operators',['../group__input-combine.html',1,'']]] + ['jagged_20tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], + ['jagged_20tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]] ]; diff --git a/search/groups_5.js b/search/groups_5.js index 1a565975f..996ae18ad 100644 --- a/search/groups_5.js +++ b/search/groups_5.js @@ -1,5 +1,5 @@ var searchData= [ - ['jagged_20tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], - ['jagged_20tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]] + ['layout_20transformation_20cpu_20operators_0',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], + ['layout_20transformation_20cuda_20operators_1',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]] ]; diff --git a/search/groups_6.js b/search/groups_6.js index 996ae18ad..817f53e67 100644 --- a/search/groups_6.js +++ b/search/groups_6.js @@ -1,5 +1,5 @@ var searchData= [ - ['layout_20transformation_20cpu_20operators_0',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], - ['layout_20transformation_20cuda_20operators_1',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]] + 
['memory_20operators_0',['CUDA Memory Operators',['../group__cumem-utils.html',1,'']]], + ['merge_20operators_1',['Merge Operators',['../group__merge-pooled-emb.html',1,'']]] ]; diff --git a/search/groups_7.js b/search/groups_7.js index 4ddf8079f..7d1f3b1d8 100644 --- a/search/groups_7.js +++ b/search/groups_7.js @@ -1,5 +1,6 @@ var searchData= [ - ['memorty_20operators_0',['CUDA Memorty Operators',['../group__cumem-utils.html',1,'']]], - ['merge_20operators_1',['Merge Operators',['../group__merge-pooled-emb.html',1,'']]] + ['operators_0',['Operators',['../group__input-combine.html',1,'Combine Input Operators'],['../group__cumem-utils.html',1,'CUDA Memory Operators'],['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__jagged-tensor-ops-cpu.html',1,'Jagged Tensor Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__merge-pooled-emb.html',1,'Merge Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], + ['operators_20cpu_1',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['operators_20cuda_2',['Operators CUDA',['../group__permute-pooled-embs-gpu.html',1,'Permute Pooled Embeddings Operators (CUDA)'],['../group__quantize-ops-cuda.html',1,'Quantization Operators (CUDA)']]] ]; diff --git a/search/groups_8.js b/search/groups_8.js index 1e64f383c..439fdcf54 100644 --- a/search/groups_8.js +++ b/search/groups_8.js @@ -1,5 +1,7 @@ var searchData= [ - ['operators_0',['operators',['../group__input-combine.html',1,'Combine 
Input Operators'],['../group__permute-pooled-embs-cpu.html',1,'CPU Permutation Operators'],['../group__cumem-utils.html',1,'CUDA Memorty Operators'],['../group__table-batched-embed-cuda.html',1,'CUDA Operators'],['../group__permute-pooled-embs-gpu.html',1,'CUDA Permutation Operators'],['../group__embedding-cpu.html',1,'Embedding CPU Operators'],['../group__embedding-cuda.html',1,'Embedding CUDA Operators'],['../group__jagged-tensor-ops-cuda.html',1,'Jagged Tensor CUDA Operators'],['../group__jagged-tensor-ops-cpu.html',1,'Jagged Tensor Operators'],['../group__layout-transform-cpu.html',1,'Layout Transformation CPU Operators'],['../group__layout-transform-cuda.html',1,'Layout Transformation CUDA Operators'],['../group__merge-pooled-emb.html',1,'Merge Operators'],['../group__quantize-data-cpu.html',1,'Quantize Data CPU Operators'],['../group__sparse-data-cpu.html',1,'Sparse Data CPU Operators'],['../group__sparse-data-cuda.html',1,'Sparse Data CUDA Operators']]], - ['operators_20for_20cuda_1',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]] + ['permute_20pooled_20embeddings_20operators_20cpu_0',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['permute_20pooled_20embeddings_20operators_20cuda_1',['Permute Pooled Embeddings Operators (CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]], + ['pooled_20embeddings_20operators_20cpu_2',['Permute Pooled Embeddings Operators (CPU)',['../group__permute-pooled-embs-cpu.html',1,'']]], + ['pooled_20embeddings_20operators_20cuda_3',['Permute Pooled Embeddings Operators (CUDA)',['../group__permute-pooled-embs-gpu.html',1,'']]] ]; diff --git a/search/groups_9.js b/search/groups_9.js index 12d4e8ad5..b57784239 100644 --- a/search/groups_9.js +++ b/search/groups_9.js @@ -1,4 +1,5 @@ var searchData= [ - ['permutation_20operators_0',['permutation operators',['../group__permute-pooled-embs-cpu.html',1,'CPU Permutation 
Operators'],['../group__permute-pooled-embs-gpu.html',1,'CUDA Permutation Operators']]] + ['quantization_20operators_20cuda_0',['Quantization Operators (CUDA)',['../group__quantize-ops-cuda.html',1,'']]], + ['quantize_20data_20cpu_20operators_1',['Quantize Data CPU Operators',['../group__quantize-data-cpu.html',1,'']]] ]; diff --git a/search/groups_a.js b/search/groups_a.js index 7c6f6fc5f..202af0400 100644 --- a/search/groups_a.js +++ b/search/groups_a.js @@ -1,5 +1,5 @@ var searchData= [ - ['quantization_20operators_20for_20cuda_0',['Quantization Operators for CUDA',['../group__quantize-ops-cuda.html',1,'']]], - ['quantize_20data_20cpu_20operators_1',['Quantize Data CPU Operators',['../group__quantize-data-cpu.html',1,'']]] + ['sparse_20data_20cpu_20operators_0',['Sparse Data CPU Operators',['../group__sparse-data-cpu.html',1,'']]], + ['sparse_20data_20cuda_20operators_1',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]] ]; diff --git a/search/groups_b.js b/search/groups_b.js index 202af0400..7146073cb 100644 --- a/search/groups_b.js +++ b/search/groups_b.js @@ -1,5 +1,7 @@ var searchData= [ - ['sparse_20data_20cpu_20operators_0',['Sparse Data CPU Operators',['../group__sparse-data-cpu.html',1,'']]], - ['sparse_20data_20cuda_20operators_1',['Sparse Data CUDA Operators',['../group__sparse-data-cuda.html',1,'']]] + ['tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], + ['tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], + ['transformation_20cpu_20operators_2',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], + ['transformation_20cuda_20operators_3',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]] ]; diff --git a/search/groups_c.js b/search/groups_c.js deleted file mode 100644 index 7146073cb..000000000 --- a/search/groups_c.js +++ /dev/null @@ -1,7 +0,0 @@ -var 
searchData= -[ - ['tensor_20cuda_20operators_0',['Jagged Tensor CUDA Operators',['../group__jagged-tensor-ops-cuda.html',1,'']]], - ['tensor_20operators_1',['Jagged Tensor Operators',['../group__jagged-tensor-ops-cpu.html',1,'']]], - ['transformation_20cpu_20operators_2',['Layout Transformation CPU Operators',['../group__layout-transform-cpu.html',1,'']]], - ['transformation_20cuda_20operators_3',['Layout Transformation CUDA Operators',['../group__layout-transform-cuda.html',1,'']]] -]; diff --git a/search/namespaces_0.js b/search/namespaces_0.js new file mode 100644 index 000000000..82a889448 --- /dev/null +++ b/search/namespaces_0.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['fbgemm_5fgpu_0',['fbgemm_gpu',['../namespacefbgemm__gpu.html',1,'']]] +]; diff --git a/search/namespaces_1.js b/search/namespaces_1.js new file mode 100644 index 000000000..f6ba93a89 --- /dev/null +++ b/search/namespaces_1.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['internal_0',['internal',['../namespaceinternal.html',1,'']]] +]; diff --git a/search/namespaces_2.js b/search/namespaces_2.js new file mode 100644 index 000000000..cc783109d --- /dev/null +++ b/search/namespaces_2.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['nbit_0',['nbit',['../namespacenbit.html',1,'']]] +]; diff --git a/search/namespaces_3.js b/search/namespaces_3.js new file mode 100644 index 000000000..bde728ac9 --- /dev/null +++ b/search/namespaces_3.js @@ -0,0 +1,4 @@ +var searchData= +[ + ['ssd_0',['ssd',['../namespacessd.html',1,'']]] +]; diff --git a/search/search.js b/search/search.js index 6fd40c677..666af01e5 100644 --- a/search/search.js +++ b/search/search.js @@ -22,58 +22,9 @@ @licend The above is the entire license notice for the JavaScript code in this file */ -function convertToId(search) -{ - var result = ''; - for (i=0;i document.getElementById("MSearchField"); + this.DOMSearchSelect = () => document.getElementById("MSearchSelect"); + this.DOMSearchSelectWindow = () => 
document.getElementById("MSearchSelectWindow"); + this.DOMPopupSearchResults = () => document.getElementById("MSearchResults"); + this.DOMPopupSearchResultsWindow = () => document.getElementById("MSearchResultsWindow"); + this.DOMSearchClose = () => document.getElementById("MSearchClose"); + this.DOMSearchBox = () => document.getElementById("MSearchBox"); // ------------ Event Handlers // Called when focus is added or removed from the search field. - this.OnSearchFieldFocus = function(isActive) - { + this.OnSearchFieldFocus = function(isActive) { this.Activate(isActive); } - this.OnSearchSelectShow = function() - { - var searchSelectWindow = this.DOMSearchSelectWindow(); - var searchField = this.DOMSearchSelect(); + this.OnSearchSelectShow = function() { + const searchSelectWindow = this.DOMSearchSelectWindow(); + const searchField = this.DOMSearchSelect(); - var left = getXPos(searchField); - var top = getYPos(searchField); - top += searchField.offsetHeight; + const left = getXPos(searchField); + const top = getYPos(searchField) + searchField.offsetHeight; // show search selection popup searchSelectWindow.style.display='block'; @@ -146,55 +102,43 @@ function SearchBox(name, resultsPath, extension) searchSelectWindow.style.top = top + 'px'; // stop selection hide timer - if (this.hideTimeout) - { + if (this.hideTimeout) { clearTimeout(this.hideTimeout); this.hideTimeout=0; } return false; // to avoid "image drag" default event } - this.OnSearchSelectHide = function() - { + this.OnSearchSelectHide = function() { this.hideTimeout = setTimeout(this.CloseSelectionWindow.bind(this), this.closeSelectionTimeout); } // Called when the content of the search field is changed. - this.OnSearchFieldChange = function(evt) - { - if (this.keyTimeout) // kill running timer - { + this.OnSearchFieldChange = function(evt) { + if (this.keyTimeout) { // kill running timer clearTimeout(this.keyTimeout); this.keyTimeout = 0; } - var e = (evt) ? 
evt : window.event; // for IE - if (e.keyCode==40 || e.keyCode==13) - { - if (e.shiftKey==1) - { + const e = evt ? evt : window.event; // for IE + if (e.keyCode==40 || e.keyCode==13) { + if (e.shiftKey==1) { this.OnSearchSelectShow(); - var win=this.DOMSearchSelectWindow(); - for (i=0;i do a search - { + const searchValue = this.DOMSearchField().value.replace(/ +/g, ""); + if (searchValue!="" && this.searchActive) { // something was found -> do a search this.Search(); } } - this.OnSearchSelectKey = function(evt) - { - var e = (evt) ? evt : window.event; // for IE - if (e.keyCode==40 && this.searchIndex0) // Up - { + } else if (e.keyCode==38 && this.searchIndex>0) { // Up this.searchIndex--; this.OnSelectItem(this.searchIndex); - } - else if (e.keyCode==13 || e.keyCode==27) - { + } else if (e.keyCode==13 || e.keyCode==27) { e.stopPropagation(); this.OnSelectItem(this.searchIndex); this.CloseSelectionWindow(); @@ -301,82 +239,75 @@ function SearchBox(name, resultsPath, extension) // --------- Actions // Closes the results window. - this.CloseResultsWindow = function() - { + this.CloseResultsWindow = function() { this.DOMPopupSearchResultsWindow().style.display = 'none'; this.DOMSearchClose().style.display = 'none'; this.Activate(false); } - this.CloseSelectionWindow = function() - { + this.CloseSelectionWindow = function() { this.DOMSearchSelectWindow().style.display = 'none'; } // Performs a search. 
- this.Search = function() - { + this.Search = function() { this.keyTimeout = 0; // strip leading whitespace - var searchValue = this.DOMSearchField().value.replace(/^ +/, ""); + const searchValue = this.DOMSearchField().value.replace(/^ +/, ""); - var code = searchValue.toLowerCase().charCodeAt(0); - var idxChar = searchValue.substr(0, 1).toLowerCase(); - if ( 0xD800 <= code && code <= 0xDBFF && searchValue > 1) // surrogate pair - { + const code = searchValue.toLowerCase().charCodeAt(0); + let idxChar = searchValue.substr(0, 1).toLowerCase(); + if ( 0xD800 <= code && code <= 0xDBFF && searchValue > 1) { // surrogate pair idxChar = searchValue.substr(0, 2); } - var jsFile; - - var idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar); - if (idx!=-1) - { - var hexCode=idx.toString(16); - jsFile = this.resultsPath + indexSectionNames[this.searchIndex] + '_' + hexCode + '.js'; + let jsFile; + let idx = indexSectionsWithContent[this.searchIndex].indexOf(idxChar); + if (idx!=-1) { + const hexCode=idx.toString(16); + jsFile = this.resultsPath + indexSectionNames[this.searchIndex] + '_' + hexCode + '.js'; } - var loadJS = function(url, impl, loc){ - var scriptTag = document.createElement('script'); + const loadJS = function(url, impl, loc) { + const scriptTag = document.createElement('script'); scriptTag.src = url; scriptTag.onload = impl; scriptTag.onreadystatechange = impl; loc.appendChild(scriptTag); } - var domPopupSearchResultsWindow = this.DOMPopupSearchResultsWindow(); - var domSearchBox = this.DOMSearchBox(); - var domPopupSearchResults = this.DOMPopupSearchResults(); - var domSearchClose = this.DOMSearchClose(); - var resultsPath = this.resultsPath; + const domPopupSearchResultsWindow = this.DOMPopupSearchResultsWindow(); + const domSearchBox = this.DOMSearchBox(); + const domPopupSearchResults = this.DOMPopupSearchResults(); + const domSearchClose = this.DOMSearchClose(); + const resultsPath = this.resultsPath; - var handleResults = function() { + 
const handleResults = function() { document.getElementById("Loading").style.display="none"; if (typeof searchData !== 'undefined') { createResults(resultsPath); document.getElementById("NoMatches").style.display="none"; } - + if (idx!=-1) { searchResults.Search(searchValue); } else { // no file with search results => force empty search results searchResults.Search('===='); } - if (domPopupSearchResultsWindow.style.display!='block') - { + if (domPopupSearchResultsWindow.style.display!='block') { domSearchClose.style.display = 'inline-block'; - var left = getXPos(domSearchBox) + 150; - var top = getYPos(domSearchBox) + 20; + let left = getXPos(domSearchBox) + 150; + let top = getYPos(domSearchBox) + 20; domPopupSearchResultsWindow.style.display = 'block'; left -= domPopupSearchResults.offsetWidth; - var maxWidth = document.body.clientWidth; - var maxHeight = document.body.clientHeight; - var width = 300; + const maxWidth = document.body.clientWidth; + const maxHeight = document.body.clientHeight; + let width = 300; if (left<10) left=10; if (width+left+8>maxWidth) width=maxWidth-left-8; - var height = 400; + let height = 400; if (height+top+8>maxHeight) height=maxHeight-top-8; domPopupSearchResultsWindow.style.top = top + 'px'; domPopupSearchResultsWindow.style.left = left + 'px'; @@ -398,17 +329,13 @@ function SearchBox(name, resultsPath, extension) // Activates or deactivates the search panel, resetting things to // their default values if necessary. 
- this.Activate = function(isActive) - { + this.Activate = function(isActive) { if (isActive || // open it - this.DOMPopupSearchResultsWindow().style.display == 'block' - ) - { + this.DOMPopupSearchResultsWindow().style.display == 'block' + ) { this.DOMSearchBox().className = 'MSearchBoxActive'; this.searchActive = true; - } - else if (!isActive) // directly remove the panel - { + } else if (!isActive) { // directly remove the panel this.DOMSearchBox().className = 'MSearchBoxInactive'; this.searchActive = false; this.lastSearchValue = '' @@ -421,409 +348,333 @@ function SearchBox(name, resultsPath, extension) // ----------------------------------------------------------------------- // The class that handles everything on the search results page. -function SearchResults(name) -{ - // The number of matches from the last run of . - this.lastMatchCount = 0; - this.lastKey = 0; - this.repeatOn = false; - - // Toggles the visibility of the passed element ID. - this.FindChildElement = function(id) - { - var parentElement = document.getElementById(id); - var element = parentElement.firstChild; - - while (element && element!=parentElement) - { - if (element.nodeName.toLowerCase() == 'div' && element.className == 'SRChildren') - { - return element; - } +function SearchResults() { + + function convertToId(search) { + let result = ''; + for (let i=0;i. + this.lastMatchCount = 0; + this.lastKey = 0; + this.repeatOn = false; - if (element && element!=parentElement) - { - element = element.nextSibling; - } - } + // Toggles the visibility of the passed element ID. 
+ this.FindChildElement = function(id) { + const parentElement = document.getElementById(id); + let element = parentElement.firstChild; + + while (element && element!=parentElement) { + if (element.nodeName.toLowerCase() == 'div' && element.className == 'SRChildren') { + return element; } - } - this.Toggle = function(id) - { - var element = this.FindChildElement(id); - if (element) - { - if (element.style.display == 'block') - { - element.style.display = 'none'; + if (element.nodeName.toLowerCase() == 'div' && element.hasChildNodes()) { + element = element.firstChild; + } else if (element.nextSibling) { + element = element.nextSibling; + } else { + do { + element = element.parentNode; } - else - { - element.style.display = 'block'; + while (element && element!=parentElement && !element.nextSibling); + + if (element && element!=parentElement) { + element = element.nextSibling; } } } + } - // Searches for the passed string. If there is no parameter, - // it takes it from the URL query. - // - // Always returns true, since other documents may try to call it - // and that may or may not be possible. - this.Search = function(search) - { - if (!search) // get search word from URL - { - search = window.location.search; - search = search.substring(1); // Remove the leading '?' 
- search = unescape(search); - } - - search = search.replace(/^ +/, ""); // strip leading spaces - search = search.replace(/ +$/, ""); // strip trailing spaces - search = search.toLowerCase(); - search = convertToId(search); - - var resultRows = document.getElementsByTagName("div"); - var matches = 0; - - var i = 0; - while (i < resultRows.length) - { - var row = resultRows.item(i); - if (row.className == "SRResult") - { - var rowMatchName = row.id.toLowerCase(); - rowMatchName = rowMatchName.replace(/^sr\d*_/, ''); // strip 'sr123_' - - if (search.length<=rowMatchName.length && - rowMatchName.substr(0, search.length)==search) - { - row.style.display = 'block'; - matches++; - } - else - { - row.style.display = 'none'; - } - } - i++; - } - document.getElementById("Searching").style.display='none'; - if (matches == 0) // no results - { - document.getElementById("NoMatches").style.display='block'; - } - else // at least one result - { - document.getElementById("NoMatches").style.display='none'; + this.Toggle = function(id) { + const element = this.FindChildElement(id); + if (element) { + if (element.style.display == 'block') { + element.style.display = 'none'; + } else { + element.style.display = 'block'; } - this.lastMatchCount = matches; - return true; } + } - // return the first item with index index or higher that is visible - this.NavNext = function(index) - { - var focusItem; - while (1) - { - var focusName = 'Item'+index; - focusItem = document.getElementById(focusName); - if (focusItem && focusItem.parentNode.parentNode.style.display=='block') - { - break; - } - else if (!focusItem) // last element - { - break; + // Searches for the passed string. If there is no parameter, + // it takes it from the URL query. + // + // Always returns true, since other documents may try to call it + // and that may or may not be possible. 
+ this.Search = function(search) { + if (!search) { // get search word from URL + search = window.location.search; + search = search.substring(1); // Remove the leading '?' + search = unescape(search); + } + + search = search.replace(/^ +/, ""); // strip leading spaces + search = search.replace(/ +$/, ""); // strip trailing spaces + search = search.toLowerCase(); + search = convertToId(search); + + const resultRows = document.getElementsByTagName("div"); + let matches = 0; + + let i = 0; + while (i < resultRows.length) { + const row = resultRows.item(i); + if (row.className == "SRResult") { + let rowMatchName = row.id.toLowerCase(); + rowMatchName = rowMatchName.replace(/^sr\d*_/, ''); // strip 'sr123_' + + if (search.length<=rowMatchName.length && + rowMatchName.substr(0, search.length)==search) { + row.style.display = 'block'; + matches++; + } else { + row.style.display = 'none'; } - focusItem=null; - index++; } - return focusItem; + i++; } + document.getElementById("Searching").style.display='none'; + if (matches == 0) { // no results + document.getElementById("NoMatches").style.display='block'; + } else { // at least one result + document.getElementById("NoMatches").style.display='none'; + } + this.lastMatchCount = matches; + return true; + } - this.NavPrev = function(index) - { - var focusItem; - while (1) - { - var focusName = 'Item'+index; - focusItem = document.getElementById(focusName); - if (focusItem && focusItem.parentNode.parentNode.style.display=='block') - { - break; - } - else if (!focusItem) // last element - { - break; - } - focusItem=null; - index--; + // return the first item with index index or higher that is visible + this.NavNext = function(index) { + let focusItem; + for (;;) { + const focusName = 'Item'+index; + focusItem = document.getElementById(focusName); + if (focusItem && focusItem.parentNode.parentNode.style.display=='block') { + break; + } else if (!focusItem) { // last element + break; + } + focusItem=null; + index++; + } + return 
focusItem; + } + + this.NavPrev = function(index) { + let focusItem; + for (;;) { + const focusName = 'Item'+index; + focusItem = document.getElementById(focusName); + if (focusItem && focusItem.parentNode.parentNode.style.display=='block') { + break; + } else if (!focusItem) { // last element + break; } - return focusItem; + focusItem=null; + index--; } + return focusItem; + } - this.ProcessKeys = function(e) - { - if (e.type == "keydown") - { - this.repeatOn = false; - this.lastKey = e.keyCode; - } - else if (e.type == "keypress") - { - if (!this.repeatOn) - { - if (this.lastKey) this.repeatOn = true; - return false; // ignore first keypress after keydown - } - } - else if (e.type == "keyup") - { - this.lastKey = 0; - this.repeatOn = false; + this.ProcessKeys = function(e) { + if (e.type == "keydown") { + this.repeatOn = false; + this.lastKey = e.keyCode; + } else if (e.type == "keypress") { + if (!this.repeatOn) { + if (this.lastKey) this.repeatOn = true; + return false; // ignore first keypress after keydown } - return this.lastKey!=0; + } else if (e.type == "keyup") { + this.lastKey = 0; + this.repeatOn = false; } + return this.lastKey!=0; + } - this.Nav = function(evt,itemIndex) - { - var e = (evt) ? evt : window.event; // for IE - if (e.keyCode==13) return true; - if (!this.ProcessKeys(e)) return false; - - if (this.lastKey==38) // Up - { - var newIndex = itemIndex-1; - var focusItem = this.NavPrev(newIndex); - if (focusItem) - { - var child = this.FindChildElement(focusItem.parentNode.parentNode.id); - if (child && child.style.display == 'block') // children visible - { - var n=0; - var tmpElem; - while (1) // search for last child - { - tmpElem = document.getElementById('Item'+newIndex+'_c'+n); - if (tmpElem) - { - focusItem = tmpElem; - } - else // found it! - { - break; - } - n++; + this.Nav = function(evt,itemIndex) { + const e = (evt) ? 
evt : window.event; // for IE + if (e.keyCode==13) return true; + if (!this.ProcessKeys(e)) return false; + + if (this.lastKey==38) { // Up + const newIndex = itemIndex-1; + let focusItem = this.NavPrev(newIndex); + if (focusItem) { + let child = this.FindChildElement(focusItem.parentNode.parentNode.id); + if (child && child.style.display == 'block') { // children visible + let n=0; + let tmpElem; + for (;;) { // search for last child + tmpElem = document.getElementById('Item'+newIndex+'_c'+n); + if (tmpElem) { + focusItem = tmpElem; + } else { // found it! + break; } + n++; } } - if (focusItem) - { - focusItem.focus(); - } - else // return focus to search field - { - document.getElementById("MSearchField").focus(); - } - } - else if (this.lastKey==40) // Down - { - var newIndex = itemIndex+1; - var focusItem; - var item = document.getElementById('Item'+itemIndex); - var elem = this.FindChildElement(item.parentNode.parentNode.id); - if (elem && elem.style.display == 'block') // children visible - { - focusItem = document.getElementById('Item'+itemIndex+'_c0'); - } - if (!focusItem) focusItem = this.NavNext(newIndex); - if (focusItem) focusItem.focus(); - } - else if (this.lastKey==39) // Right - { - var item = document.getElementById('Item'+itemIndex); - var elem = this.FindChildElement(item.parentNode.parentNode.id); - if (elem) elem.style.display = 'block'; - } - else if (this.lastKey==37) // Left - { - var item = document.getElementById('Item'+itemIndex); - var elem = this.FindChildElement(item.parentNode.parentNode.id); - if (elem) elem.style.display = 'none'; } - else if (this.lastKey==27) // Escape - { - e.stopPropagation(); - searchBox.CloseResultsWindow(); + if (focusItem) { + focusItem.focus(); + } else { // return focus to search field document.getElementById("MSearchField").focus(); } - else if (this.lastKey==13) // Enter - { - return true; - } - return false; + } else if (this.lastKey==40) { // Down + const newIndex = itemIndex+1; + let focusItem; + 
const item = document.getElementById('Item'+itemIndex); + const elem = this.FindChildElement(item.parentNode.parentNode.id); + if (elem && elem.style.display == 'block') { // children visible + focusItem = document.getElementById('Item'+itemIndex+'_c0'); + } + if (!focusItem) focusItem = this.NavNext(newIndex); + if (focusItem) focusItem.focus(); + } else if (this.lastKey==39) { // Right + const item = document.getElementById('Item'+itemIndex); + const elem = this.FindChildElement(item.parentNode.parentNode.id); + if (elem) elem.style.display = 'block'; + } else if (this.lastKey==37) { // Left + const item = document.getElementById('Item'+itemIndex); + const elem = this.FindChildElement(item.parentNode.parentNode.id); + if (elem) elem.style.display = 'none'; + } else if (this.lastKey==27) { // Escape + e.stopPropagation(); + searchBox.CloseResultsWindow(); + document.getElementById("MSearchField").focus(); + } else if (this.lastKey==13) { // Enter + return true; } + return false; + } - this.NavChild = function(evt,itemIndex,childIndex) - { - var e = (evt) ? evt : window.event; // for IE - if (e.keyCode==13) return true; - if (!this.ProcessKeys(e)) return false; - - if (this.lastKey==38) // Up - { - if (childIndex>0) - { - var newIndex = childIndex-1; - document.getElementById('Item'+itemIndex+'_c'+newIndex).focus(); - } - else // already at first child, jump to parent - { - document.getElementById('Item'+itemIndex).focus(); - } + this.NavChild = function(evt,itemIndex,childIndex) { + const e = (evt) ? 
evt : window.event; // for IE + if (e.keyCode==13) return true; + if (!this.ProcessKeys(e)) return false; + + if (this.lastKey==38) { // Up + if (childIndex>0) { + const newIndex = childIndex-1; + document.getElementById('Item'+itemIndex+'_c'+newIndex).focus(); + } else { // already at first child, jump to parent + document.getElementById('Item'+itemIndex).focus(); } - else if (this.lastKey==40) // Down - { - var newIndex = childIndex+1; - var elem = document.getElementById('Item'+itemIndex+'_c'+newIndex); - if (!elem) // last child, jump to parent next parent - { - elem = this.NavNext(itemIndex+1); - } - if (elem) - { - elem.focus(); - } + } else if (this.lastKey==40) { // Down + const newIndex = childIndex+1; + let elem = document.getElementById('Item'+itemIndex+'_c'+newIndex); + if (!elem) { // last child, jump to parent next parent + elem = this.NavNext(itemIndex+1); } - else if (this.lastKey==27) // Escape - { - e.stopPropagation(); - searchBox.CloseResultsWindow(); - document.getElementById("MSearchField").focus(); + if (elem) { + elem.focus(); } - else if (this.lastKey==13) // Enter - { - return true; - } - return false; + } else if (this.lastKey==27) { // Escape + e.stopPropagation(); + searchBox.CloseResultsWindow(); + document.getElementById("MSearchField").focus(); + } else if (this.lastKey==13) { // Enter + return true; } + return false; + } } -function setKeyActions(elem,action) -{ - elem.setAttribute('onkeydown',action); - elem.setAttribute('onkeypress',action); - elem.setAttribute('onkeyup',action); -} +function createResults(resultsPath) { -function setClassAttr(elem,attr) -{ - elem.setAttribute('class',attr); - elem.setAttribute('className',attr); -} + function setKeyActions(elem,action) { + elem.setAttribute('onkeydown',action); + elem.setAttribute('onkeypress',action); + elem.setAttribute('onkeyup',action); + } + + function setClassAttr(elem,attr) { + elem.setAttribute('class',attr); + elem.setAttribute('className',attr); + } -function 
createResults(resultsPath) -{ - var results = document.getElementById("SRResults"); + const results = document.getElementById("SRResults"); results.innerHTML = ''; - for (var e=0; e { + const id = elem[0]; + const srResult = document.createElement('div'); srResult.setAttribute('id','SR_'+id); setClassAttr(srResult,'SRResult'); - var srEntry = document.createElement('div'); + const srEntry = document.createElement('div'); setClassAttr(srEntry,'SREntry'); - var srLink = document.createElement('a'); - srLink.setAttribute('id','Item'+e); - setKeyActions(srLink,'return searchResults.Nav(event,'+e+')'); + const srLink = document.createElement('a'); + srLink.setAttribute('id','Item'+index); + setKeyActions(srLink,'return searchResults.Nav(event,'+index+')'); setClassAttr(srLink,'SRSymbol'); - srLink.innerHTML = searchData[e][1][0]; + srLink.innerHTML = elem[1][0]; srEntry.appendChild(srLink); - if (searchData[e][1].length==2) // single result - { - srLink.setAttribute('href',resultsPath+searchData[e][1][1][0]); + if (elem[1].length==2) { // single result + srLink.setAttribute('href',resultsPath+elem[1][1][0]); srLink.setAttribute('onclick','searchBox.CloseResultsWindow()'); - if (searchData[e][1][1][1]) - { + if (elem[1][1][1]) { srLink.setAttribute('target','_parent'); - } - else - { + } else { srLink.setAttribute('target','_blank'); } - var srScope = document.createElement('span'); + const srScope = document.createElement('span'); setClassAttr(srScope,'SRScope'); - srScope.innerHTML = searchData[e][1][1][2]; + srScope.innerHTML = elem[1][1][2]; srEntry.appendChild(srScope); - } - else // multiple results - { + } else { // multiple results srLink.setAttribute('href','javascript:searchResults.Toggle("SR_'+id+'")'); - var srChildren = document.createElement('div'); + const srChildren = document.createElement('div'); setClassAttr(srChildren,'SRChildren'); - for (var c=0; c5. :mod:`test_py_module`","3. Paragraph Level Markup","4. Lists & Tables","1. Long Sticky Nav","1. 
Structural Elements","<no title>","Installation"],titleterms:{"long":23,"public":10,And:21,But:22,For:9,The:[9,21],admonit:21,against:17,agreement:16,amdgpu:10,arg:20,attribut:15,base:19,batch:[8,14],benchmark:11,block:21,build:[9,19],built:17,bullet:22,can:22,caption:22,center:21,chang:17,changelog:18,check:[9,10],citat:21,cla:16,code:[15,21],combin:1,compat:9,compil:9,compound:21,conda:[9,10],conduct:15,configur:19,contain:10,content:[19,20,21,22,23,24],context:19,contribut:16,contributor:16,cpu:[0,2,3,6,7,9,10],creativ:21,cuda:[0,2,3,4,6,7,9,10,11],cudnn:9,data:[7,20],deeper:22,definit:22,develop:[9,17],direct:21,doc:17,docker:[9,10],doctest:21,document:[12,24],down:22,download:[21,26],driver:10,element:24,embed:[0,5,8,14],emphas:21,enforc:15,enumer:22,environ:[9,10],exampl:[21,23],fbgemm:12,fbgemm_gpu:[9,10,11],field:22,figur:21,font:17,footnot:21,gener:20,giant:22,git:26,glibc:9,glossari:21,grid:22,have:22,hlist:22,hole:22,how:19,html:19,ien:21,imag:[9,21,22],index:20,inlin:21,input:1,instal:[9,10,26],instruct:[9,10],isol:9,issu:16,jag:[2,13],layout:3,level:[19,21,22],librari:10,licens:16,like:22,line:21,link:21,list:22,liter:21,local:17,markup:21,math:21,memori:4,menu:23,merg:5,meta:21,miniconda:9,miopen:9,mobil:17,mod:20,nav:23,navig:17,number:[21,22],nvidia:10,one:22,onli:[9,10],oper:[0,1,2,3,4,5,6,7,8,13,14],option:[19,20,22],other:9,our:15,packag:[9,10],page:19,paragraph:[21,24],paramet:20,permut:5,pip:[9,10],pledg:15,pool:5,post:[9,10],prepar:9,process:9,project:19,publish:17,pull:16,pypi:10,python:10,pytorch:[9,10,17],pytorch_sphinx_them:16,quantiz:6,quot:21,rabbit:22,refer:21,replac:21,request:16,respons:15,rocm:[9,10,11],rubric:21,runtim:10,scope:15,second:22,section:24,set:[9,10],sidebar:21,spars:7,sphinx:17,standard:15,sticki:23,structur:24,stylesheet:17,submenu:23,submit:17,subsect:24,subsubmenu:23,subsubsect:24,symbol:[9,10],tabl:[8,14,19,20,21,22,23,24],target:21,tbe:14,tensor:[2,13],test:[11,17],test_py_modul:20,text:21,theme:[17,19],thi:22,throu
gh:[9,10],titl:21,toc:19,tool:9,top:17,topic:21,transform:3,tutori:17,undefin:[9,10],variant:11,version:9,via:26,wai:21,welcom:12,wide:19,your:17}}) \ No newline at end of file +Search.setIndex({"docnames": ["cpp-api/embedding_ops", "cpp-api/input_combine", "cpp-api/jagged_tensor_ops", "cpp-api/layout_transform_ops", "cpp-api/memory_utils", "cpp-api/merge_pooled_embeddings", "cpp-api/quantize_ops", "cpp-api/sparse_ops", "cpp-api/split_table_batched_embeddings", "general/BuildInstructions", "general/DocsInstructions", "general/InstallationInstructions", "general/TestInstructions", "index", "python-api/jagged_tensor_ops", "python-api/table_batched_embedding_ops", "pytorch-sphinx-theme/docs/changelog", "pytorch-sphinx-theme/docs/configuring", "pytorch-sphinx-theme/docs/demo/api", "pytorch-sphinx-theme/docs/demo/demo", "pytorch-sphinx-theme/docs/demo/lists_tables", "pytorch-sphinx-theme/docs/demo/long", "pytorch-sphinx-theme/docs/demo/structure", "pytorch-sphinx-theme/docs/index", "pytorch-sphinx-theme/docs/installing"], "filenames": ["cpp-api/embedding_ops.rst", "cpp-api/input_combine.rst", "cpp-api/jagged_tensor_ops.rst", "cpp-api/layout_transform_ops.rst", "cpp-api/memory_utils.rst", "cpp-api/merge_pooled_embeddings.rst", "cpp-api/quantize_ops.rst", "cpp-api/sparse_ops.rst", "cpp-api/split_table_batched_embeddings.rst", "general/BuildInstructions.rst", "general/DocsInstructions.rst", "general/InstallationInstructions.rst", "general/TestInstructions.rst", "index.rst", "python-api/jagged_tensor_ops.rst", "python-api/table_batched_embedding_ops.rst", "pytorch-sphinx-theme/docs/changelog.rst", "pytorch-sphinx-theme/docs/configuring.rst", "pytorch-sphinx-theme/docs/demo/api.rst", "pytorch-sphinx-theme/docs/demo/demo.rst", "pytorch-sphinx-theme/docs/demo/lists_tables.rst", "pytorch-sphinx-theme/docs/demo/long.rst", "pytorch-sphinx-theme/docs/demo/structure.rst", "pytorch-sphinx-theme/docs/index.rst", "pytorch-sphinx-theme/docs/installing.rst"], "titles": ["Embedding 
Operators", "Combine Input Operators", "Jagged Tensor Operators", "Layout Transformation Operators", "CUDA Memory Operators", "Pooled Embeddings Operators", "Quantization Operators", "Sparse Data Operators", "Table Batched Embedding Operators", "Build Instructions", "Contributing Documentation", "Installation Instructions", "Testing FBGEMM_GPU", "Welcome to FBGEMM\u2019s documentation!", "Jagged Tensor Operators", "Table Batched Embedding (TBE) Operators", "Changelog", "Configuration", "5. :mod:`test_py_module`", "3. Paragraph Level Markup", "4. Lists & Tables", "1. Long Sticky Nav", "1. Structural Elements", "<no title>", "Installation"], "terms": {"tensor": [0, 1, 3, 4, 5, 6, 7, 8, 13, 15], "split_embedding_codegen_lookup_adagrad_funct": 0, "const": [0, 1, 2, 3, 4, 5, 6, 7], "placeholder_autograd_tensor": 0, "dev_weight": [0, 8], "uvm_weight": [0, 8], "lxu_cache_weight": [0, 8], "weights_plac": [0, 8], "weights_offset": [0, 8], "d_offset": [0, 6, 8], "int64_t": [0, 1, 2, 3, 4, 6, 7, 8], "total_d": [0, 8, 15], "max_d": 0, "hash_size_cumsum": [0, 8], "total_hash_size_bit": 0, "indic": [0, 8, 15, 17, 19], "offset": [0, 2, 7, 8, 14, 15], "pooling_mod": [0, 15], "c10": [0, 2, 4, 8], "option": [0, 2, 4, 8, 9, 14, 15, 19, 23], "indice_weight": 0, "feature_requires_grad": [0, 15], "lxu_cache_loc": [0, 8], "bool": [0, 4, 5, 6, 8, 10, 15, 17], "gradient_clip": [0, 15], "doubl": [0, 2, 6, 7, 19], "max_gradi": [0, 15], "stochastic_round": [0, 8, 15], "momentum1_dev": [0, 8], "momentum1_uvm": [0, 8], "momentum1_plac": [0, 8], "momentum1_offset": [0, 8], "ep": [0, 15], "0": [0, 7, 8, 9, 11, 14, 15, 16, 18, 19], "learning_r": [0, 15], "output_dtyp": [0, 6, 15], "static_cast": 0, "sparsetyp": [0, 15], "fp32": [0, 15], "b_offset": 0, "vbe_output_offsets_feature_rank": 0, "vbe_b_offsets_rank_per_featur": 0, "max_b": 0, "1": [0, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 23], "max_b_feature_rank": 0, "vbe_output_s": 0, "is_experiment": 0, "fals": [0, 4, 15, 17, 18, 19, 20], 
"use_uniq_cache_locations_bwd": 0, "use_homogeneous_plac": 0, "split_embedding_codegen_lookup_adam_funct": 0, "momentum2_dev": 0, "momentum2_uvm": 0, "momentum2_plac": 0, "momentum2_offset": 0, "beta1": [0, 15], "beta2": [0, 15], "weight_decai": [0, 15], "iter": 0, "split_embedding_codegen_lookup_approx_rowwise_adagrad_funct": 0, "weight_decay_mod": [0, 15], "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_funct": 0, "prev_iter_dev": 0, "prev_iter_uvm": 0, "prev_iter_plac": 0, "prev_iter_offset": 0, "row_counter_dev": 0, "row_counter_uvm": 0, "row_counter_plac": 0, "row_counter_offset": 0, "counter_halflif": 0, "adjustment_it": 0, "adjustment_ub": 0, "learning_rate_mod": 0, "grad_sum_decai": 0, "max_count": 0, "tail_id_threshold": 0, "is_tail_id_thresh_ratio": 0, "regularization_mod": 0, "weight_norm_coeffici": 0, "lower_bound": [0, 7], "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_funct": 0, "split_embedding_codegen_lookup_approx_sgd_funct": 0, "split_embedding_codegen_lookup_lamb_funct": 0, "split_embedding_codegen_lookup_lars_sgd_funct": 0, "eta": [0, 15], "momentum": [0, 15], "split_embedding_codegen_lookup_none_funct": 0, "total_hash_s": 0, "total_unique_indic": 0, "split_embedding_codegen_lookup_partial_rowwise_adam_funct": 0, "split_embedding_codegen_lookup_partial_rowwise_lamb_funct": 0, "split_embedding_codegen_lookup_rowwise_adagrad_funct": 0, "max_norm": 0, "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_funct": 0, "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_funct": 0, "split_embedding_codegen_lookup_rowwise_weighted_adagrad_funct": 0, "split_embedding_codegen_lookup_sgd_funct": 0, "void": [0, 4, 6, 8], "bounds_check_indices_cuda": 0, "rows_per_t": 0, "bounds_check_mod": [0, 15], "warn": [0, 10, 15], "weight": [0, 7, 8, 15, 19], "b_ofset": 0, "int_nbit_split_embedding_codegen_lookup_funct": 0, "weights_ti": [0, 8], "max_int2_d": 0, "max_int4_d": 0, "max_int8_d": 0, 
"max_float16_d": 0, "max_float32_d": 0, "row_align": [0, 8], "max_float8_d": 0, "fp8_exponent_bit": 0, "fp8_exponent_bia": 0, "int_nbit_split_embedding_uvm_caching_codegen_lookup_funct": 0, "cache_hash_size_cumsum": [0, 8], "total_cache_hash_s": [0, 8], "cache_index_table_map": [0, 8], "lxu_cache_st": [0, 8], "lxu_stat": 0, "simlar": 0, "doe": [0, 10, 11, 19], "uvm_cach": 0, "lookup": [0, 8], "pruned_hashmap_lookup_cuda": 0, "hash_tabl": 0, "hash_table_offset": 0, "pruned_array_lookup_cuda": 0, "index_remap": 0, "index_remappings_offset": 0, "int_nbit_split_embedding_codegen_lookup_function_cpu": 0, "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu": 0, "pruned_hashmap_insert_unweighted_cpu": 0, "dense_indic": 0, "pruned_hashmap_lookup_unweighted_cpu": 0, "pruned_array_lookup_cpu": 0, "std": [1, 2, 3, 4, 5, 7, 8], "tupl": [1, 2, 7, 8, 15], "tbe_input_combine_cpu": 1, "vector": [1, 2, 3, 4, 5, 14], "indices_list": 1, "offsets_list": 1, "per_sample_weight": [1, 15], "include_last_offset": 1, "padding_fused_tbe_input_combine_cpu": 1, "batch_siz": 1, "solv": 2, "issu": [2, 4, 9, 11], "when": [2, 7, 9, 10, 15, 17, 19, 21], "row": [2, 8, 14, 15, 20], "dimens": [2, 4, 7, 14, 15], "ar": [2, 8, 9, 10, 11, 14, 15, 17, 19, 20, 21], "differ": [2, 7, 17], "length": [2, 7, 14, 15, 20], "thi": [2, 4, 5, 7, 9, 10, 11, 13, 17, 18, 19, 21, 22], "often": [2, 19], "occur": [2, 10], "spars": [2, 13], "featur": [2, 7, 15], "input": [2, 4, 6, 7, 10, 13, 14, 15, 20], "recommend": [2, 9, 11, 19], "system": [2, 9, 11], "well": [2, 7, 9, 10, 19], "natur": [2, 19], "languag": [2, 18, 19, 20], "process": [2, 11, 18, 19], "batch": [2, 7, 13, 14], "jagged_to_padded_dense_forward": 2, "valu": [2, 4, 6, 7, 8, 10, 14, 15, 17], "symintarrayref": 2, "max_length": [2, 14], "padding_valu": [2, 14], "jagged_dense_elementwise_add_jagged_output_cuda": 2, "x_valu": [2, 14], "x_offset": [2, 14], "y": [2, 9, 10, 11, 14, 20], "output": [2, 6, 7, 10, 14, 15, 17, 19, 20], "x": [2, 10, 20], 
"where": [2, 4, 7, 14, 15, 20], "i": [2, 4, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 21, 22], "dens": [2, 14], "jagged_to_padded_dens": [2, 13, 14], "jagged_dense_elementwise_add": [2, 13, 14], "jagged_dense_elementwise_mul": [2, 13, 14], "batched_dense_vec_jagged_2d_mul": [2, 13, 14], "v": [2, 12, 14, 20], "a_valu": [2, 14], "a_offset": [2, 14], "dense_to_jag": [2, 13, 14], "symint": 2, "total_l": [2, 14], "jagged_dense_elementwise_add_jagged_output": [2, 13, 14], "jagged_1d_to_dens": [2, 13, 14], "max_l": 2, "jagged_2d_to_dens": [2, 9, 11, 13, 14], "max_sequence_length": [2, 14], "recat_embedding_grad_output_cuda": 3, "grad_output": 3, "num_features_per_rank": 3, "recat_embedding_grad_output_mixed_d_cuda": 3, "dim_sum_per_rank": 3, "recat_embedding_grad_output_mixed_d_batch_cuda": 3, "cumsum_dim_sum_per_rank": 3, "recat_embedding_grad_output_mixed_d_cpu": 3, "new_managed_tensor": 4, "self": 4, "size": [4, 6, 7, 14, 15, 17], "alloc": 4, "an": [4, 7, 10, 11, 13, 15, 17, 19, 20], "unifi": 4, "manag": [4, 9, 11, 15], "uvm": [4, 12], "Then": 4, "set": [4, 8, 12, 13, 14, 15, 17, 21, 24], "its": [4, 7, 9, 10, 15, 19], "prefer": [4, 11], "storag": [4, 6, 8], "locat": [4, 8, 9], "cpu": [4, 5, 10, 12, 13], "host": 4, "establish": 4, "map": [4, 7, 8, 15], "devic": [4, 5, 9, 12, 15], "paramet": [4, 7, 10, 14, 15, 23], "The": [4, 7, 10, 11, 12, 14, 15, 17, 20], "target": [4, 7, 9], "return": [4, 7, 10, 14, 15], "A": [4, 10, 11, 14, 15, 20], "new": [4, 8, 10, 19], "back": [4, 8, 9, 11, 19], "new_managed_tensor_meta": 4, "placehold": 4, "meta": [4, 23], "dispatch": 4, "kei": [4, 19], "empti": [4, 14, 20], "new_host_mapped_tensor": 4, "new_unified_tensor": 4, "is_host_map": 4, "either": [4, 7, 9, 11, 17, 19], "whether": [4, 9, 18], "depend": [4, 9, 11], "new_vanilla_managed_tensor": 4, "allow": [4, 17], "automat": [4, 7, 10, 12, 19], "uvm_storag": 4, "check": [4, 13, 15], "gpu": [4, 9, 11, 12], "true": [4, 15, 17, 20], "otherwis": [4, 11], "is_uvm_tensor": 4, "BUT": 4, "non": [4, 
15, 20, 22], "uvm_to_cpu": 4, "convert": [4, 6, 14], "effect": [4, 17], "move": 4, "from": [4, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20], "uvm_to_devic": 4, "prototyp": 4, "creat": [4, 9, 10, 19], "share": 4, "same": [4, 7, 9, 10, 14, 17, 19], "whose": 4, "uvm_cuda_mem_advis": 4, "cuda_memory_advis": 4, "call": [4, 11, 19], "cudamemadvis": 4, "": [4, 9, 10, 12, 17, 19, 20], "cudamemoryadvis": 4, "enum": 4, "avail": [4, 9, 10, 17], "python": [4, 9, 12, 19, 20], "side": [4, 17], "fbgemm_gpu": [4, 15], "namespac": 4, "see": [4, 9, 10, 11, 19], "document": [4, 17, 19, 20, 24], "over": [4, 9, 19], "valid": 4, "here": [4, 9, 10, 17, 19, 20], "For": [4, 10, 12, 17, 19, 20], "more": [4, 9, 10, 15, 17, 20], "inform": [4, 10, 19], "integ": [4, 22], "uvm_cuda_mem_prefetch_async": 4, "device_t": 4, "cudamemprefetchasync": 4, "prefetch": 4, "destin": 4, "uvm_mem_advice_dont_fork": 4, "madvis": 4, "madv_dontfork": 4, "workaround": 4, "kernel": [4, 12], "driver": [4, 9], "un": 4, "page": [4, 21, 23], "tabl": [4, 7, 13, 23], "fork": 4, "caus": [4, 9, 11, 17], "slowdown": 4, "next": [4, 17, 19, 20], "access": [4, 15, 17], "uvm_to_cpu_clon": 4, "copi": [4, 20], "contigu": [4, 7], "t": [4, 7, 9, 10, 15, 17, 19, 20], "us": [4, 7, 9, 10, 12, 15, 17, 18, 19, 20], "singl": [4, 19], "thread": 4, "memcpi": 4, "contain": [4, 9, 14, 15, 19, 20], "data": [4, 13, 15, 19, 23], "section": [5, 9, 17, 19, 20, 21, 23], "includ": [5, 9, 10, 17, 19], "cuda": [5, 13, 15], "variou": 5, "all_to_one_devic": 5, "inputtensor": 5, "target_devic": 5, "permute_pooled_embs_split_gpu": 5, "pooled_emb": 5, "offset_dim_list": 5, "permute_list": 5, "inv_offset_dim_list": 5, "inv_permute_list": 5, "permute_pooled_embs_auto_grad_split_gpu": 5, "permute_pooled_embs_auto_grad_gpu": 5, "permute_pooled_embs_cpu_impl": 5, "allow_dupl": 5, "permute_pooled_embs_split_cpu": 5, "permute_pooled_embs_auto_grad_split_cpu": 5, "permute_pooled_embs_auto_grad": 5, "permute_pooled_embs_auto_grad_cpu": 5, "model": [6, 7], "optim": 
[6, 15], "techniqu": 6, "reduc": 6, "larg": [6, 17, 19], "order": 6, "achiev": [6, 11], "better": [6, 10], "perform": [6, 7], "small": 6, "loss": 6, "accuraci": 6, "dll_public": [6, 7], "_float_to_bfloat16_gpu": 6, "float": [6, 10, 14, 15], "brain": 6, "point": [6, 14, 17, 18, 19], "bfloat16": 6, "_bfloat16_to_float_gpu": 6, "_float_to_fp8rowwise_gpu": 6, "forward": 6, "_float_to_fused8bitrowwise_gpu": 6, "_single_or_half_precision_to_fused8bitrowwise_gpu": 6, "_fused8bitrowwise_to_single_or_half_precision_gpu": 6, "_fused8bitrowwise_to_float_mixed_dim_gpu": 6, "templat": [6, 9, 17], "typenam": 6, "input_t": 6, "_float_to_fusednbitrowwise_gpu_t": 6, "bit_rat": [6, 7], "_float_to_fusednbitrowwise_gpu": 6, "_half_to_fusednbitrowwise_gpu": 6, "output_t": 6, "_fusednbitrowwise_to_float_gpu_t": 6, "_fusednbitrowwise_to_half_gpu": 6, "_fusednbitrowwise_to_float_or_half_gpu": 6, "_float_to_hfp8_gpu": 6, "ebit": 6, "exponent_bia": 6, "max_po": 6, "hybrid": 6, "8": [6, 9, 15, 19, 20, 23], "bit": 6, "hfp8": 6, "_hfp8_to_float_gpu": 6, "_float_to_msfp_gpu": 6, "bounding_box_s": 6, "mbit": 6, "bia": 6, "min_po": 6, "microsoft": 6, "msfp": 6, "_msfp_to_float_gpu": 6, "_float_to_paddedfp8rowwise_gpu": 6, "row_dim": 6, "_fused8bitrowwise_to_float_cpu_out": 6, "_float_to_fused8bitrowwise_cpu_out": 6, "float_to_fused8bitrowwise_cpu": 6, "half_to_fused8bitrowwise_cpu": 6, "float_or_half_to_fused8bitrowwise_cpu": 6, "fused8bitrowwise_to_float_cpu": 6, "fused8bitrowwise_to_half_cpu": 6, "fused8bitrowwise_to_float_or_half_cpu": 6, "float_to_fp8rowwise_cpu": 6, "fp8rowwise_to_float_cpu": 6, "fusednbitrowwise_to_float_cpu": 6, "fusednbitrowwise_to_half_cpu": 6, "fusednbitrowwise_to_float_or_half_cpu": 6, "floattofp8quantized_ref": 6, "size_t": 6, "nrow": 6, "ncol": 6, "uint8_t": [6, 8], "int": [6, 10, 14, 15, 17], "fp8quantizedtofloat_ref": 6, "expand_into_jagged_permute_cuda": 7, "permut": [7, 13], "input_offset": 7, "output_offset": 7, "output_s": 7, "expand_into_jagged_permut": 7, 
"expand": 7, "index": [7, 8, 9, 10, 11, 17, 23], "case": [7, 9, 11], "ha": [7, 10, 11, 19, 20], "across": [7, 9, 17], "rank": [7, 17], "level": [7, 23], "exclus": 7, "op": [7, 11, 14], "each": [7, 9, 10, 14, 15, 17, 19], "bag": [7, 15], "correspond": [7, 8, 10], "posit": [7, 15, 17], "sit": [7, 20, 22], "after": [7, 9, 10, 11, 12, 15], "we": [7, 17, 19], "deriv": 7, "arrai": [7, 14], "comput": [7, 9, 11, 15, 19], "follow": [7, 9, 10, 11, 17, 19, 20, 24], "formula": 7, "output_permut": 7, "table_offset": 7, "bag_offset": 7, "_float_or_half_to_fusednbitrowwise_gpu": 7, "histogram_binning_calibration_cpu": 7, "logit": 7, "bin_num_exampl": 7, "bin_num_posit": 7, "positive_weight": 7, "upper_bound": 7, "bin_ctr_in_use_aft": 7, "bin_ctr_weight_valu": 7, "divid": 7, "predict": 7, "rang": 7, "e": [7, 9], "g": [7, 9, 19], "b": [7, 9, 10, 14, 15, 20], "bin": [7, 9], "In": [7, 10, 11, 19, 20, 22], "two": [7, 14, 15, 19, 20, 24], "store": [7, 8], "number": [7, 9, 10, 14, 15, 17], "exampl": [7, 9, 10, 11, 12, 14, 15, 17, 18, 20, 23], "fall": [7, 9, 11], "bucket": [7, 9], "so": [7, 9, 11, 18, 19, 20], "basic": [7, 20], "have": [7, 8, 19], "histogram": 7, "As": [7, 9, 10, 11], "result": [7, 14, 17], "statist": 7, "real": 7, "ctr": 7, "num_po": 7, "num_exampl": 7, "final": 7, "calibr": 7, "pre": [7, 9, 11], "cali": 7, "wai": 7, "within": [7, 17, 19], "should": [7, 8, 10, 11, 17, 19, 22], "suffici": [7, 10], "That": [7, 19], "fine": 7, "grain": 7, "modul": [7, 10, 11, 15, 18, 19, 20], "theoret": 7, "layer": 7, "can": [7, 9, 10, 11, 17, 19], "fix": [7, 19], "ani": [7, 10, 14, 17, 19], "uncalibr": 7, "befor": [7, 15], "appli": [7, 9, 15], "sigmoid": 7, "assum": 7, "calibart": 7, "pass": [7, 10, 15], "argument": [7, 10, 19, 20], "all": [7, 8, 9, 10, 11, 15, 19, 20], "which": [7, 9, 10, 11, 15, 17, 19, 20], "lower": [7, 20], "bound": 7, "calibration_target": 7, "don": [7, 9, 10, 17, 19, 20], "onli": [7, 8, 10, 12, 13, 19], "observ": 7, "default": [7, 9, 11, 15, 17, 18, 19], "specifi": 
[7, 9, 14, 15, 17, 19], "sum": [7, 14, 15], "statisct": 7, "final_calibrated_predict": 7, "bin_ctr_weight": 7, "bin_ctr": 7, "calibrated_predict": 7, "bin_id": 7, "generic_histogram_binning_calibration_by_feature_cpu": 7, "segment_valu": 7, "segment_length": 7, "num_seg": 7, "bin_boundari": 7, "extens": [7, 10], "base": [7, 8, 9, 19], "one": [7, 8, 10, 14, 15, 19], "specif": [7, 9, 10, 15, 19], "ectr": 7, "abov": [7, 10, 11, 18, 19], "accept": 7, "gener": [7, 9, 11, 17, 19, 20, 23], "sort": [7, 8, 9], "keyjaggedtensor": 7, "num_bin": 7, "longer": [7, 17], "still": [7, 9], "parambin_ctr_weight_valu": 7, "get_unique_indices_cuda": 8, "linear_indic": 8, "max_indic": 8, "compute_count": 8, "dedupl": 8, "pair": [8, 19], "lru_cache_find_uncached_cuda": 8, "unique_indic": 8, "unique_indices_length": 8, "time_stamp": 8, "lru_stat": 8, "gather_cache_stat": 8, "uvm_cache_stat": 8, "lock_cache_lin": 8, "lxu_cache_locking_count": 8, "lru": [8, 15], "cach": [8, 9, 15], "find": [8, 9, 10], "uncach": 8, "them": [8, 10, 17], "host_lxu_cache_slot": 8, "h_in": 8, "c": [8, 11, 20], "cache_set": [8, 15], "linearize_cache_indices_cuda": 8, "linear": 8, "make": [8, 9, 10, 19, 21], "uniqu": 8, "linearize_cache_indices_from_row_idx_cuda": 8, "update_table_indic": 8, "update_row_indic": 8, "note": [8, 9, 10, 11, 17, 19], "format": [8, 10, 19, 20], "inplac": 8, "updat": [8, 9, 11, 15], "lru_cache_populate_cuda": 8, "linear_cache_indic": 8, "fetch": 8, "insert": 8, "timestep": 8, "lru_cache_populate_byte_cuda": 8, "byte": 8, "element": [8, 19, 20, 23], "direct_mapped_lru_cache_populate_byte_cuda": 8, "lxu_cache_miss_timestamp": 8, "direct": [8, 10, 11, 23], "assoc": 8, "variant": [8, 9, 10, 11], "lfu_cache_populate_cuda": 8, "lfu_stat": 8, "lfu": [8, 15], "lfu_cache_populate_byte_cuda": 8, "lxu_cache_lookup_cuda": 8, "invalid_index": 8, "num_uniq_cache_indic": 8, "lxu_cache_locations_output": 8, "look": [8, 10, 15], "up": [8, 13, 15, 19], "slot": 8, "sentinel": 8, "miss": [8, 9, 19], 
"direct_mapped_lxu_cache_lookup_cuda": 8, "lxu_cache_flush_cuda": 8, "flush": [8, 19], "reset_weight_momentum_cuda": 8, "pruned_indic": 8, "pruned_indices_offset": 8, "logical_table_id": 8, "buffer_id": 8, "lxu_cache_locking_counter_decrement_cuda": 8, "decrement": 8, "counter": 8, "lxu_cache_locations_update_cuda": 8, "lxu_cache_locations_new": 8, "most": [9, 10, 11], "date": [9, 10, 11, 20], "embed": [9, 10, 11, 13, 19], "script": [9, 10, 11], "bundl": [9, 10, 11], "repo": [9, 10, 11], "under": [9, 10, 11, 19], "setup_env": [9, 10, 11], "bash": [9, 10, 11], "step": [9, 10, 11], "toolchain": [9, 11], "run": [9, 10, 11, 12, 19], "reproduc": [9, 11], "export": [9, 12], "platform_nam": 9, "unam": 9, "m": [9, 11, 12, 20], "prefix": [9, 19], "directori": [9, 10, 12], "miniconda_prefix": 9, "home": 9, "download": [9, 11, 23], "wget": 9, "q": 9, "http": [9, 10, 11, 17, 19], "anaconda": 9, "com": 9, "miniconda3": 9, "latest": [9, 17], "sh": 9, "o": [9, 11, 19], "p": 9, "u": 9, "load": 9, "shortcut": 9, "bashrc": 9, "n": [9, 11], "out": [9, 19], "command": [9, 10, 11, 19, 20], "against": [9, 12], "insid": [9, 10, 11, 19], "env_nam": [9, 11], "env": [9, 11], "name": [9, 10, 11, 19, 20], "python_vers": 9, "3": [9, 14, 15, 19, 20, 23], "12": [9, 15, 19, 23], "upgrad": 9, "pyopenssl": 9, "22": 9, "requir": [9, 10, 11, 15, 17], "recent": [9, 11], "nvcc": 9, "support": [9, 10, 11, 17, 18], "capabl": [9, 12], "5": [9, 15, 19, 20, 23], "machin": [9, 11, 12], "done": [9, 11], "built": [9, 10, 11], "bare": 9, "metal": 9, "neither": 9, "nor": 9, "nvidia": 9, "need": [9, 10, 11, 12, 19], "present": 9, "sinc": [9, 19], "thei": [9, 10, 17, 19], "runtim": 9, "setup": [9, 11], "simpli": 9, "pull": [9, 10, 11], "desir": [9, 14, 20], "linux": [9, 11], "distribut": 9, "ubuntu": 9, "04": 9, "11": [9, 11, 19, 23], "entrypoint": 9, "devel": 9, "ubuntu22": 9, "rest": [9, 11, 19, 20], "mai": [9, 11, 19, 20], "construct": [9, 11, 19, 20], "mechan": 9, "full": [9, 11], "nvml": 9, "cuda_vers": 9, 
"7": [9, 11, 14, 15, 23], "label": [9, 19], "verifi": [9, 10, 11], "cuda_runtim": 9, "h": [9, 10, 14], "libnvidia": [9, 11], "ml": [9, 11], "found": [9, 10, 11, 19], "conda_prefix": 9, "printenv": 9, "time": [9, 11, 17, 19], "extract": 9, "given": [9, 14], "url": [9, 11, 17], "platform": 9, "github": [9, 17], "builder": 9, "blob": [9, 17], "main": [9, 17, 19, 21], "common": [9, 11, 19], "install_cuda": 9, "cudnn_url": 9, "redist": 9, "v8": 9, "local_instal": 9, "x86_64": 9, "84_cuda11": 9, "archiv": 9, "tar": [9, 19], "xz": 9, "unpack": 9, "amd": [9, 11], "minim": 9, "6": [9, 11, 23], "termin": 9, "both": [9, 17, 19], "minimum": [9, 10], "oper": [9, 11, 13], "guid": [9, 10, 18], "disabl": [9, 17, 18], "apt": 9, "prompt": 9, "debian_frontend": 9, "noninteract": 9, "db": 9, "radeon": 9, "amdgpu": 9, "focal": 9, "install_5": 9, "4": [9, 11, 14, 15, 17, 19, 20, 23], "50403": 9, "1_all": 9, "deb": 9, "usecas": 9, "hiplibsdk": 9, "dkm": 9, "hipifi": 9, "clang": 9, "hip": 9, "dev": 9, "gcc": 9, "17": [9, 23], "oppos": 9, "becaus": 9, "reli": 9, "path": [9, 10], "sysroot": 9, "also": [9, 15, 17, 19, 20], "avoid": 9, "glibcxx": 9, "fbgemm_cpu": 9, "gxx_linux": 9, "64": 9, "10": [9, 11, 23], "sysroot_linux": 9, "2": [9, 10, 11, 14, 15, 17, 19, 20, 23], "forg": [9, 10], "while": 9, "newer": 9, "binari": 9, "older": [9, 11], "20": [9, 23], "cento": 9, "stream": 9, "librari": [9, 13], "refer": [9, 10, 13, 18, 23], "libstdc": 9, "To": [9, 10, 12, 17], "what": [9, 10, 19], "libcxx_path": 9, "print": [9, 10, 11, 15, 19], "objdump": 9, "tc": 9, "grep": 9, "glibc_": 9, "sed": [9, 20, 22], "9": [9, 15, 23], "vu": 9, "cat": 9, "glibcxx_": 9, "necessari": 9, "ninja": 9, "cmake": 9, "etc": [9, 15, 17, 19], "click": 9, "hypothesi": [9, 11], "jinja2": 9, "numpi": [9, 11], "scikit": [9, 11], "wheel": 9, "offici": 9, "homepag": 9, "authorit": [9, 10, 11], "how": [9, 10, 11, 12, 19, 21, 23], "nightli": [9, 11], "test": [9, 11, 13, 19, 20], "rc": 9, "without": [9, 17, 19], "alwai": [9, 19], 
"reliabl": 9, "known": [9, 15], "arriv": 9, "hour": 9, "later": 9, "than": [9, 18, 19], "window": [9, 19], "silent": 9, "place": [9, 15, 19, 20, 21], "artifact": 9, "select": [9, 18, 21], "dure": [9, 15], "thu": [9, 15, 19], "import": [9, 11, 15, 17, 19], "first": [9, 10, 19, 20], "prior": [9, 11], "much": [9, 17, 19], "determinist": 9, "torch": [9, 11, 14, 15], "org": [9, 11, 17, 19], "whl": [9, 11], "cu121": [9, 11], "rocm5": [9, 11], "channel": [9, 11, 18], "write": [9, 10, 11, 19], "ensur": [9, 11], "properli": 9, "__version__": 9, "cuda_cmake_macro": 9, "clone": 9, "along": [9, 11], "submodul": 9, "txt": [9, 10], "tag": [9, 10, 19], "fbgemm_vers": 9, "v0": [9, 16], "git": [9, 23], "recurs": 9, "fbgemm": [9, 10, 11, 14], "fbgemm_": 9, "addit": [9, 14], "cd": [9, 10, 12], "flow": [9, 19], "keep": 9, "state": 9, "becom": 9, "stale": 9, "problem": [9, 19], "re": [9, 11, 19], "attempt": 9, "failur": [9, 11], "due": [9, 18, 19], "address": [9, 20], "clear": 9, "py": [9, 10, 11, 12, 17, 19, 24], "clean": [9, 19], "made": [9, 10, 19], "variabl": 9, "presenc": 9, "howev": [9, 17, 18], "determin": 9, "processor": 9, "architectur": [9, 11], "arch": 9, "unabl": 9, "cudacxx": 9, "cuda_bin_path": 9, "provid": [9, 10, 11, 12, 13, 19], "cub": 9, "applic": [9, 15], "cub_dir": 9, "header": [9, 10, 17, 19, 20], "cudnn_include_dir": 9, "cudnn_librari": 9, "lib": [9, 11], "nvml_lib_path": 9, "reflect": 9, "python_tag": 9, "py310": 9, "package_nam": 9, "sm70": [9, 11], "80": 9, "v100": [9, 11], "a100": [9, 11], "If": [9, 10, 11, 15, 17, 19], "current": [9, 11, 15, 17], "cuda_arch_list": 9, "unset": 9, "torch_cuda_arch_list": 9, "exist": [9, 10], "bc": 9, "take": [9, 17, 19], "preced": [9, 19], "dtorch_cuda_arch_list": 9, "invoc": [9, 10], "bdist_wheel": 9, "package_vari": 9, "plat": 9, "manylinux1_": 9, "rocm_path": 9, "pytorch_rocm_arch": 9, "gfx906": 9, "gfx908": 9, "gfx90a": 9, "wiki": 9, "gentoo": 9, "list": [9, 10, 14, 15, 23], "rocminfo": 9, "gfx": 9, "fbgemm_gpu_rocm": 9, 
"dhip_root_dir": 9, "dcmake_c_flag": 9, "dtorch_use_hip_dsa": 9, "dcmake_cxx_flag": 9, "cpu_onli": 9, "flag": 9, "fbgemm_gpu_cpu": 9, "complet": [9, 10], "some": [9, 10, 18, 19], "actual": 9, "correct": [9, 17], "lot": 9, "jinja": 9, "instanti": 9, "sure": [9, 10], "accident": 9, "cours": 9, "file": [9, 10, 11, 17, 18, 20, 24], "fbgemm_gpu_lib_path": 9, "fbgemm_gpu_pi": [9, 11], "defin": [9, 10, 17], "function": [9, 10], "nm": 9, "gdcu": 9, "It": [9, 11, 17, 19], "referenc": 9, "certain": 9, "must": [9, 11, 12, 15, 19, 20], "gdc": 9, "merge_pooled_embed": [9, 11], "comment": [10, 19], "sourc": 10, "packag": [10, 12, 13], "instruct": [10, 13], "isol": [10, 11, 13], "conda": 10, "correctli": [10, 11], "instal": [10, 12, 13, 23], "tool": [10, 13], "doc": [10, 17, 18, 19, 24], "sphinx": [10, 17, 18, 19, 20, 24], "other": [10, 11, 18, 19], "pip": 10, "r": [10, 19], "doxygen": 10, "assembl": 10, "togeth": 10, "html": [10, 19], "view": [10, 17], "serv": 10, "pytorch": [10, 13], "project": [10, 18, 23], "deploi": 10, "netlifi": 10, "request": [10, 15], "pr": [10, 11], "app": 10, "public": [10, 20], "method": [10, 19], "accompani": 10, "itself": 10, "put": [10, 19], "yourself": 10, "shoe": 10, "develop": [10, 11, 19, 20], "who": [10, 19], "understand": 10, "your": [10, 17, 19, 20, 24], "live": [10, 19], "easier": 10, "leav": 10, "docstr": [10, 19, 20], "separ": 10, "task": 10, "At": [10, 18], "veri": [10, 20, 21], "add": [10, 14, 17, 19, 24], "descript": [10, 20], "usag": [10, 11, 19], "link": [10, 11, 17, 18, 23], "limit": [10, 18], "through": [10, 19], "pleas": [10, 19], "googl": [10, 17], "style": [10, 17, 19, 20], "def": [10, 19], "example_funct": 10, "class": [10, 19, 20], "you": [10, 17, 19, 20, 21], "multipl": [10, 14, 15, 19, 20], "line": [10, 11, 20], "those": [10, 14, 19], "about": [10, 17], "arg": [10, 14, 23], "arg1": 10, "rais": [10, 11], "attributeerror": [10, 11], "error": [10, 11, 19], "block": [10, 23], "publish": 10, "rst": [10, 17, 19], "local": [10, 17], 
"chang": [10, 17], "submit": 10, "javadoc": 10, "breath": 10, "kept": [10, 20], "cpp": 10, "cu": 10, "cuh": 10, "everyth": 10, "between": [10, 19, 20], "ifndef": 10, "doxygen_this_will_be_skip": 10, "endif": 10, "hidden": [10, 17], "moment": 10, "undocu": 10, "descriptionss": 10, "configur": [10, 23], "group": [10, 19, 20], "organ": [10, 20], "defgroup": 10, "verbatim": 10, "param": [10, 18], "ingroup": 10, "example_method": 10, "foo": [10, 19, 20], "lst": 10, "param1": 10, "param2": 10, "throw": 10, "my_error": 10, "info": [10, 11], "href": 10, "www": [10, 17, 19], "nl": 10, "manual": [10, 11, 18, 19], "cmdlink": 10, "int32_t": 10, "bar": [10, 17, 19], "doxygengroup": 10, "alreadi": [10, 11], "content": [10, 23], "toctre": [10, 17], "ini": 10, "append": 10, "build": [11, 12, 13, 18, 23], "work": [11, 19, 20], "version": [11, 17, 20], "sm80": 11, "respect": 11, "scratch": 11, "guarante": 11, "especi": 11, "displai": [11, 17], "do": [11, 18, 20], "smi": 11, "515": 11, "76": 11, "persist": [11, 19], "bu": 11, "id": [11, 17, 18, 22], "disp": 11, "volatil": 11, "uncorr": 11, "ecc": 11, "fan": 11, "temp": 11, "perf": 11, "pwr": 11, "cap": 11, "memori": [11, 13, 15], "util": 11, "mig": 11, "a10g": 11, "off": 11, "00000000": 11, "00": 11, "1e": 11, "31c": 11, "p0": 11, "59w": 11, "300w": 11, "0mib": 11, "23028mib": 11, "gi": 11, "ci": 11, "pid": 11, "type": [11, 14, 15, 19], "No": 11, "though": 11, "expos": 11, "detail": 11, "onc": 11, "imag": 11, "launch": 11, "toolkit": 11, "interfac": 11, "concis": 11, "dieedg": 11, "avgpwr": 11, "sclk": 11, "mclk": 11, "pwrcap": 11, "vram": 11, "33": 11, "0c": 11, "37": 11, "0w": 11, "300mhz": 11, "1200mhz": 11, "auto": [11, 19, 20], "290": 11, "32": 11, "39": 11, "end": [11, 19], "log": 11, "difficult": 11, "relev": 11, "releas": [11, 19], "encount": 11, "signatur": 11, "traceback": 11, "last": 11, "root": [11, 17], "miniconda": 11, "mycondaenv": 11, "python3": 11, "site": [11, 19], "_op": 11, "565": 11, "__getattr__": 11, 
"overload_nam": 11, "_c": 11, "_jit_get_oper": 11, "qualified_op_nam": 11, "runtimeerror": 11, "except": 11, "wa": [11, 19], "string": [11, 17], "post47": 11, "py3": 11, "aarch64": 11, "egg": 11, "__init__": [11, 19], "21": 11, "_fbgemm_gpu_doc": 11, "noqa": 11, "f401": 11, "e402": 11, "18": [11, 23], "569": 11, "_opnamespac": 11, "object": 11, "attribut": [11, 19], "cli": 11, "main_run": 11, "execut": [11, 12], "47": 11, "fail": [11, 12], "_zn6fbgemm48floatorhalftofusednbitrowwisequantizedsbhalfavx2itli2eeevpkt_miph": 11, "appear": [11, 19], "reason": 11, "libtorch": 11, "visibl": 11, "ld_library_path": 11, "incorrectli": 11, "declar": 11, "were": [11, 14], "1618": 11, "former": 11, "resolv": 11, "latter": 11, "seriou": 11, "tha": 11, "report": 11, "directoi": 12, "bench": 12, "good": [12, 19], "pytest": 12, "rsx": 12, "w": 12, "ignor": [12, 15], "pytestcollectionwarn": 12, "split_table_batched_embeddings_test": 12, "quantize_ops_test": 12, "sparse_ops_test": 12, "split_embedding_inference_converter_test": 12, "detect": 12, "mode": [12, 15], "cuda_visible_devic": 12, "environ": [12, 13], "enabl": [12, 17], "debug": 12, "cuda_launch_block": 12, "fbgemm_test_with_rocm": 12, "hip_launch_block": 12, "split_table_batched_embeddings_benchmark": 12, "comprehens": 13, "rocm": 13, "post": 13, "benchmark": 13, "contribut": 13, "guidelin": 13, "ad": 13, "code": [13, 18, 20], "tbe": 13, "splittablebatchedembeddingbagscodegen": [13, 15], "jag": 13, "jagged_dense_dense_elementwise_add_jagged_output": [13, 14], "stacked_jagged_1d_to_dens": [13, 14], "stacked_jagged_2d_to_dens": [13, 14], "quantiz": 13, "pool": [13, 15], "merg": 13, "combin": 13, "layout": 13, "transform": 13, "2d": [14, 15], "pad": 14, "zero": 14, "1d": [14, 15], "start": [14, 20], "maximum": 14, "area": [14, 21], "outsid": [14, 19], "coverag": 14, "total": [14, 15], "identit": 14, "purpos": [14, 15], "structur": [14, 17, 19, 23], "y_0": 14, "y_1": 14, "elementwis": 14, "multipli": [14, 15], "matrix": 14, 
"max_n": 14, "d": [14, 19, 20], "matmul": 14, "kwarg": 14, "split_table_batched_embeddings_op": 15, "embedding_spec": 15, "feature_table_map": 15, "none": [15, 17, 19], "cache_algorithm": 15, "cachealgorithm": 15, "cache_load_factor": 15, "cache_reserved_memori": 15, "cache_precis": 15, "weights_precis": 15, "enforce_hbm": 15, "optimtyp": 15, "exact_sgd": 15, "record_cache_metr": 15, "01": [15, 20], "0e": 15, "weightdecaymod": 15, "001": 15, "999": 15, "poolingmod": 15, "boundscheckmod": 15, "train": 15, "backward": 15, "fuse": 15, "embeddingloc": 15, "computedevic": 15, "spec": 15, "placement": 15, "lxu": 15, "algorithm": 15, "capac": 15, "amount": 15, "reserv": 15, "hbm": 15, "fp16": 15, "int8": 15, "adam": 15, "exact_adagrad": 15, "exact_rowwise_adagrad": 15, "exact_rowwise_weighted_adagrad": 15, "lamb": 15, "lars_sgd": 15, "partial_rowwise_adam": 15, "partial_rowwise_lamb": 15, "sgd": 15, "recordcachemetr": 15, "record": 15, "hit": 15, "record_cache_miss_count": 15, "similar": 15, "metric": 15, "wise": 15, "record_tablewise_cache_miss": 15, "stochast": 15, "round": 15, "gradient": 15, "clip": 15, "learn": 15, "rate": 15, "epsilon": 15, "adagrad": 15, "lar": 15, "decai": 15, "rowwis": 15, "l2": 15, "decoupl": 15, "mean": 15, "boundari": 15, "fatal": 15, "conatin": 15, "shape": 15, "max": [15, 17], "column": [15, 20], "read": [15, 17, 19], "split_table_batched_embeddings_ops_common": 15, "split_table_batched_embeddings_ops_train": 15, "init_embedding_weights_uniform": 15, "split_embedding_weight": 15, "9426": 15, "7046": 15, "4214": 15, "0419": 15, "1331": 15, "7856": 15, "8124": 15, "2021": 15, "5771": 15, "5911": 15, "7792": 15, "1068": 15, "6203": 15, "4813": 15, "1677": 15, "4790": 15, "5587": 15, "0941": 15, "5754": 15, "3475": 15, "8952": 15, "1964": 15, "0810": 15, "4174": 15, "2513": 15, "4039": 15, "3775": 15, "3273": 15, "5399": 15, "0229": 15, "1455": 15, "8770": 15, "9520": 15, "4593": 15, "7169": 15, "6307": 15, "1765": 15, "8757": 15, "8614": 15, 
"2051": 15, "0603": 15, "9980": 15, "7958": 15, "5826": 15, "dtype": 15, "long": [15, 19, 20], "13": [15, 19, 23], "5197": 15, "2957": 15, "3578": 15, "1487": 15, "4873": 15, "3044": 15, "9801": 15, "2769": 15, "7164": 15, "8528": 15, "7159": 15, "6719": 15, "0784": 15, "2016": 15, "2176": 15, "1988": 15, "3825": 15, "5008": 15, "8991": 15, "1405": 15, "2637": 15, "9427": 15, "8902": 15, "3754": 15, "5013": 15, "6105": 15, "9968": 15, "3057": 15, "7621": 15, "9821": 15, "7314": 15, "6195": 15, "grad_fn": 15, "cppnode": 15, "splitlookupfunction_sgd_op": 15, "part": [17, 18, 19], "pytorch_sphinx_them": [17, 24], "conf": [17, 24], "repositori": [17, 24], "via": [17, 23], "html_theme_opt": 17, "canonical_url": 17, "analytics_id": 17, "logo_onli": 17, "display_vers": 17, "prev_next_buttons_loc": 17, "bottom": 17, "style_external_link": 17, "vcs_pageview_mod": 17, "collapse_navig": 17, "sticky_navig": [17, 21], "navigation_depth": 17, "includehidden": 17, "titles_onli": 17, "canon": 17, "let": [17, 19], "search": 17, "engin": 17, "know": [17, 19], "give": [17, 19], "higher": [17, 19], "trail": 17, "slash": 17, "analyt": 17, "With": [17, 19], "isn": [17, 19], "shown": [17, 19], "top": [17, 21], "sidebar": [17, 23], "previou": 17, "button": [17, 19], "accordingli": 17, "icon": [17, 19], "extern": [17, 19], "display_github": 17, "display_gitlab": 17, "gitlab": 17, "edit": 17, "raw": 17, "bitbucket": 17, "These": [17, 19], "en": 17, "stabl": 17, "lose": 17, "drop": 17, "down": 17, "scroll": [17, 21], "depth": 17, "tree": 17, "unlimit": 17, "mark": 17, "remov": 17, "high": 17, "mani": [17, 19, 20], "deep": 17, "significantli": 17, "larger": 17, "compil": 17, "todo": 17, "metadata": 17, "render": 17, "github_url": 17, "forc": 17, "bitbucket_url": 17, "gitlab_url": 17, "left": [17, 19], "menu": [17, 19], "upon": 17, "visitor": 17, "revert": 17, "usual": 17, "misbuild": 17, "might": 17, "show": [17, 19], "properti": 17, "By": 17, "navig": 17, "stick": 17, "screen": 17, "vertic": 
[17, 19], "too": [17, 19, 20], "static": 17, "sticki": [17, 23], "nav": [17, 23], "altogeth": 17, "cannot": 18, "like": [18, 19], "come": 18, "django": 18, "payment": 18, "dotpai": 18, "dotpayprovid": 18, "seller_id": 18, "pin": 18, "lock": 18, "lang": 18, "pl": 18, "backend": 18, "implement": 18, "popular": 18, "polish": 18, "gatewai": 18, "api": 18, "transfer": 18, "purchas": 18, "item": [18, 20], "seller": 18, "assign": 18, "consult": 18, "ui": 18, "data_item_1": 18, "restructuredtext": [19, 20], "demonstr": [19, 20, 21], "demo": 19, "parser": 19, "emphasi": 19, "strong": 19, "standalon": 19, "hyperlink": 19, "intern": 19, "cross": 19, "uri": 19, "web": 19, "anonym": 19, "symbol": 19, "substitut": 19, "below": 19, "charact": 19, "possibl": 19, "although": 19, "exceedingli": 19, "ugli": 19, "problemat": 19, "intent": 19, "ext": [19, 20], "autodoc": [19, 20], "test_py_modul": [19, 23], "right": 19, "my": 19, "role": 19, "interpret": 19, "explicit": 19, "pep": 19, "287": 19, "rfc": 19, "2822": 19, "subscript": 19, "superscript": 19, "standard": 19, "gui": 19, "action": 19, "taken": 19, "user": [19, 20], "height": 19, "interfer": 19, "adjac": 19, "bind": 19, "press": 19, "keyboard": 19, "mous": 19, "mmb": 19, "shift": 19, "anoth": [19, 20], "menuselect": 19, "short": [19, 20], "softwar": 19, "seen": [19, 20], "break": 19, "fit": 19, "sub": 19, "wrap": [19, 22], "whitespac": 19, "signific": 19, "strang": 19, "hyphen": 19, "word": 19, "adjust": 19, "width": 19, "browser": 19, "now": 19, "space": [19, 20], "sentenc": 19, "suppli": 19, "258": 19, "equat": 19, "x_": 19, "x_0": 19, "x_1": 19, "x_2": 19, "x_3": 19, "x_4": 19, "nabla": 19, "f": 19, "frac": 19, "partial": 19, "sin": 19, "theta": 19, "phi": 19, "eq": 19, "colon": 19, "indent": 19, "literal_block": 19, "spaces_and_linebreak": 19, "preserv": 19, "markup_process": 19, "Or": 19, "great": 19, "idea": 19, "why": 19, "didn": 19, "think": 19, "blank": 19, "begin": 19, "initi": 19, "continu": 19, "portion": 19, "edg": 
19, "align": 19, "second": 19, "permit": 19, "awai": 19, "eric": 19, "orchestra": 19, "leader": 19, "three": [19, 20], "four": [19, 20], "half": 19, "bee": 19, "philosoph": 19, "ipso": 19, "facto": 19, "But": 19, "got": 19, "vi": 19, "entiti": 19, "said": 19, "entir": 19, "ancient": 19, "injuri": 19, "sing": 19, "consist": 19, "bodi": [19, 20], "theori": 19, "elk": 19, "bracket": 19, "goe": 19, "brontosaurus": 19, "thin": 19, "thicker": 19, "middl": 19, "again": 19, "far": 19, "mine": 19, "belong": 19, "me": [19, 20], "own": 19, "ann": 19, "begun": 19, "cut": 19, "past": 19, "interact": 19, "session": 19, "pars": 19, "curl": 19, "someurl": 19, "gz": [19, 20], "caption": [19, 22], "pane": 19, "shell_command": 19, "echo": 19, "did": 19, "window_nam": 19, "form": 19, "session_nam": 19, "shorthand": 19, "some_funct": 19, "interest": 19, "highlight": 19, "THE": 19, "heaven": 19, "hexagram": 19, "six": 19, "unbroken": 19, "stand": 19, "primal": 19, "power": 19, "light": 19, "activ": 19, "spirit": 19, "weak": 19, "essenc": 19, "energi": 19, "Its": 19, "repres": 19, "unrestrict": 19, "condit": 19, "therefor": 19, "conceiv": 19, "motion": 19, "regard": 19, "basi": 19, "durat": 19, "dual": 19, "sens": 19, "term": [19, 20], "univers": 19, "world": 19, "men": 19, "relat": 19, "express": 19, "deiti": 19, "human": 19, "denot": 19, "holi": 19, "man": [19, 20], "sage": 19, "ruler": 19, "hi": [19, 20], "awaken": 19, "utf": [19, 20], "sphinx_rtd_them": [19, 20], "nest": [19, 20], "dl": 19, "dt": 19, "tt": 19, "descnam": 19, "descclassnam": 19, "normal": 19, "just": [19, 21], "wrote": 19, "anyth": [19, 20], "els": [19, 20], "programm": 19, "myclass": 19, "dothismethod": 19, "meth": 19, "capit": 19, "flox": 19, "One": [19, 20], "least": [19, 20], "sequenc": 19, "unreferenc": 19, "nonexist": 19, "_": 19, "extrem": 19, "tell": 19, "doesn": 19, "respons": 19, "stuff": 19, "mayb": 19, "bold": 19, "ital": 19, "heck": 19, "backlink": 19, "definit": 19, "thing": 19, "knowledg": 19, "someth": 
19, "ones": 19, "mind": 19, "ey": 19, "thought": 19, "medium": 19, "peopl": 19, "implicit": 19, "subsect": 19, "interpol": 19, "indirect": 19, "phrase": 19, "sampl": 19, "docutil": [19, 20], "sourceforg": [19, 20], "net": [19, 20], "ref": 19, "statement": 19, "clickabl": 19, "legend": 19, "revis": [19, 20], "revisit": 19, "enhanc": 19, "structuredtext": 19, "wooden": 19, "nickel": 19, "mad": 19, "scientist": 19, "bigger": 19, "bread": 19, "box": 19, "wash": 19, "behind": 19, "ear": 19, "room": 19, "closet": 19, "bathroom": 19, "trash": 19, "sink": 19, "mother": 19, "g_": 19, "mu": 19, "nu": 19, "pi": 19, "t_": 19, "rho_": 19, "lambda": 19, "15": [19, 23], "servic": 19, "thing1": 19, "thing2": 19, "thing3": 19, "prose": 19, "provok": 19, "mental": 19, "exert": 19, "reader": 19, "discret": 19, "strongli": 19, "advis": 19, "subtitl": 19, "besid": 19, "border": 19, "background": 19, "color": 19, "try": [19, 20], "best": 19, "around": [19, 22], "connect": 19, "ok": 19, "transmit": 19, "disconnect": 19, "simpl": [19, 20], "nonetheless": 19, "semant": 19, "produc": 19, "blue": 19, "lead": 19, "white": 19, "arab": 20, "numer": 20, "alpha": 20, "roman": 20, "upper": 20, "iii": 20, "iv": 20, "classifi": 20, "paragraph": [20, 23], "regardless": 20, "z": 20, "verbos": 20, "commonli": 20, "vm": 20, "There": 20, "author": 20, "david": 20, "goodger": 20, "123": 20, "street": 20, "ex": [20, 22], "canada": 20, "a1b": 20, "2c3": 20, "contact": 20, "myself": 20, "humankind": 20, "2012": 20, "03": 20, "19": [20, 23], "23": 20, "53": 20, "0000": 20, "tue": 20, "jan": 20, "statu": 20, "progress": 20, "7302": 20, "copyright": 20, "been": 20, "domain": 20, "wish": 20, "modifi": 20, "redistribut": 20, "reattribut": 20, "sell": 20, "bui": 20, "rent": 20, "leas": 20, "destroi": 20, "improv": 20, "quot": 20, "excerpt": 20, "incorpor": 20, "collat": 20, "fold": 20, "stapl": 20, "mutil": 20, "anyon": 20, "heart": 20, "bibliograph": 20, "dedic": 20, "co": 20, "abstract": 20, "markup": [20, 23], 
"advanc": 20, "third": 20, "inlin": [20, 23], "literal": 20, "yahoo": 20, "inner": 20, "oh": 20, "liter": 20, "heh": 20, "child": 20, "beat": 20, "emb": 20, "text": [20, 22], "hehe": 20, "sai": 20, "cackl": 20, "night": 20, "lone": 20, "guangzhou": 20, "destini": 20, "hope": 20, "dream": 20, "forth": 20, "fifth": 20, "sixth": 20, "figur": [20, 22], "lorem": [20, 22], "ipsum": [20, 22], "dolor": [20, 22], "amet": [20, 22], "consectetur": [20, 22], "adipisc": [20, 22], "elit": [20, 22], "donec": [20, 22], "porttitor": [20, 22], "odio": [20, 22], "posuer": [20, 22], "vita": [20, 22], "ornar": [20, 22], "libero": [20, 22], "matti": 20, "loborti": [20, 22], "justo": [20, 22], "vestibulum": [20, 22], "nibh": [20, 22], "aliquet": [20, 22], "feugiat": [20, 22], "sagitti": [20, 22], "nequ": [20, 22], "qui": [20, 22], "eleifend": 20, "dui": [20, 22], "rutrum": [20, 22], "lectu": [20, 22], "suscipit": [20, 22], "letter": 20, "cell": 20, "span": 20, "nam": [20, 22], "mauri": [20, 22], "arcu": [20, 22], "stub": 20, "behav": 21, "holder": 21, "interdum": 22, "nec": 22, "finibu": 22, "dictum": 22, "velit": 22, "ut": 22, "eu": 22, "efficitur": 22, "aliquam": 22, "erat": 22, "diam": 22, "gravida": 22, "imperdiet": 22, "tellu": 22, "nisl": 22, "praesent": 22, "eget": 22, "elementum": 22, "rhoncu": 22, "tincidunt": 22, "suspendiss": 22, "volutpat": 22, "scelerisqu": 22, "tristiqu": 22, "aenean": 22, "condimentum": 22, "risu": 22, "accumsan": 22, "laoreet": 22, "maximu": 22, "sapien": 22, "ligula": 22, "fringilla": 22, "commodo": 22, "proin": 22, "et": 22, "pharetra": 22, "etiam": 22, "turpi": 22, "ant": 22, "luctu": 22, "vel": 22, "malesuada": 22, "dignissim": 22, "mi": 22, "nunc": 22, "augu": 22, "sem": 22, "cursu": 22, "nulla": 22, "pellentesqu": 22, "habit": 22, "morbi": 22, "senectu": 22, "netu": 22, "fame": 22, "ac": 22, "egesta": 22, "placerat": 22, "tortor": 22, "iaculi": 22, "venenati": 22, "cra": 22, "puru": 22, "ero": 22, "vehicula": 22, "fusc": 22, "auctor": 22, 
"phasellu": 22, "est": 22, "viverra": 22, "conval": 22, "faucibu": 22, "vulput": 22, "feli": 22, "sodal": 22, "maecena": 22, "congu": 22, "semper": 22, "enim": 22, "blandit": 22, "sollicitudin": 22, "urna": 22, "orci": 22, "lacu": 22, "quisqu": 22, "facilisi": 22, "hendrerit": 22, "curabitur": 22, "variu": 22, "bibendum": 22, "massa": 22, "magna": 22, "tempu": 22, "metu": 22, "nisi": 22, "pretium": 22, "leo": 22, "euismod": 22, "ultric": 22, "potenti": 22, "dapibu": 22, "lacinia": 22, "vivamu": 22, "molesti": 22, "hac": 22, "habitass": 22, "platea": 22, "dictumst": 22, "wide": 23, "changelog": 23, "math": 23, "mod": 23, "14": 23, "16": 23, "submenu": 23, "symlink": 24, "subtre": 24, "_theme": 24, "html_theme": 24, "html_theme_path": 24}, "objects": {"": [[6, 0, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ebits"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::exponent_bias"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::input"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::ncols"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::nrows"], [6, 1, 1, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi", "FP8QuantizedToFloat_ref::output"], [6, 0, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu"], [6, 1, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::forward"], [6, 1, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::input"], [6, 1, 1, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t", "FP8rowwise_to_float_cpu::output_dtype"], [6, 0, 1, 
"_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ebits"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::exponent_bias"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::input"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::max_pos"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::ncols"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::nrows"], [6, 1, 1, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd", "FloatToFP8Quantized_ref::output"], [6, 0, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out"], [6, 1, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::input"], [6, 1, 1, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor", "_float_to_fused8bitrowwise_cpu_out::output"], [6, 0, 1, "_CPPv4I0E32_float_to_fusednbitrowwise_gpu_t6TensorRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu_t"], [6, 1, 1, "_CPPv4I0E32_float_to_fusednbitrowwise_gpu_t6TensorRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu_t::bit_rate"], [6, 1, 1, "_CPPv4I0E32_float_to_fusednbitrowwise_gpu_t6TensorRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu_t::input"], [6, 2, 1, "_CPPv4I0E32_float_to_fusednbitrowwise_gpu_t6TensorRK6TensorK7int64_t", "_float_to_fusednbitrowwise_gpu_t::input_t"], [6, 0, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out"], [6, 1, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", 
"_fused8bitrowwise_to_float_cpu_out::input"], [6, 1, 1, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor", "_fused8bitrowwise_to_float_cpu_out::output"], [6, 0, 1, "_CPPv4I0E32_fusednbitrowwise_to_float_gpu_t6TensorRK6TensorK7int64_t", "_fusednbitrowwise_to_float_gpu_t"], [6, 1, 1, "_CPPv4I0E32_fusednbitrowwise_to_float_gpu_t6TensorRK6TensorK7int64_t", "_fusednbitrowwise_to_float_gpu_t::bit_rate"], [6, 1, 1, "_CPPv4I0E32_fusednbitrowwise_to_float_gpu_t6TensorRK6TensorK7int64_t", "_fusednbitrowwise_to_float_gpu_t::input"], [6, 2, 1, "_CPPv4I0E32_fusednbitrowwise_to_float_gpu_t6TensorRK6TensorK7int64_t", "_fusednbitrowwise_to_float_gpu_t::output_t"], [5, 0, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device"], [5, 1, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::inputTensors"], [5, 1, 1, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE", "all_to_one_device::target_device"], [2, 0, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul"], [2, 1, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_offsets"], [2, 1, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::a_values"], [2, 1, 1, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor", "batched_dense_vec_jagged_2d_mul::v"], [0, 0, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::B_ofsets"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", 
"bounds_check_indices_cuda::bounds_check_mode"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::indices"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::max_B"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::offsets"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::rows_per_table"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::warning"], [0, 1, 1, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t", "bounds_check_indices_cuda::weights"], [2, 0, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEEN3c108optionalIN2at6SymIntEEE", "dense_to_jagged"], [2, 1, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEEN3c108optionalIN2at6SymIntEEE", "dense_to_jagged::dense"], [2, 1, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEEN3c108optionalIN2at6SymIntEEE", "dense_to_jagged::offsets"], [2, 1, 1, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEEN3c108optionalIN2at6SymIntEEE", "dense_to_jagged::total_L"], [8, 0, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda"], [8, 1, 1, 
"_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::D_offsets"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::cache_index_table_map"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::hash_size_cumsum"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::linear_cache_indices"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::lru_state"], [8, 1, 1, 
"_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::lxu_cache_miss_timestamp"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::row_alignment"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::time_stamp"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::total_cache_hash_size"], [8, 1, 1, 
"_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::uvm_cache_stats"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::weights"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::weights_offsets"], [8, 1, 1, "_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lru_cache_populate_byte_cuda::weights_tys"], [8, 0, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda"], [8, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::invalid_index"], [8, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::linear_cache_indices"], [8, 1, 1, 
"_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "direct_mapped_lxu_cache_lookup_cuda::uvm_cache_stats"], [7, 0, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda"], [7, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::input_offsets"], [7, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_offsets"], [7, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::output_size"], [7, 1, 1, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t", "expand_into_jagged_permute_cuda::permute"], [6, 0, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu"], [6, 1, 1, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor", "float_or_half_to_fused8bitrowwise_cpu::input"], [6, 0, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu"], [6, 1, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::forward"], [6, 1, 1, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb", "float_to_FP8rowwise_cpu::input"], [6, 0, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu"], [6, 1, 1, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor", "float_to_fused8bitrowwise_cpu::input"], [6, 0, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu"], [6, 1, 1, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor", "fused8bitrowwise_to_float_cpu::input"], [6, 0, 1, 
"_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_t", "fused8bitrowwise_to_float_or_half_cpu"], [6, 1, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_t", "fused8bitrowwise_to_float_or_half_cpu::input"], [6, 1, 1, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_t", "fused8bitrowwise_to_float_or_half_cpu::output_dtype"], [6, 0, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu"], [6, 1, 1, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor", "fused8bitrowwise_to_half_cpu::input"], [6, 0, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu"], [6, 1, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::bit_rate"], [6, 1, 1, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_float_cpu::input"], [6, 0, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu"], [6, 1, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::bit_rate"], [6, 1, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::input"], [6, 1, 1, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t", "fusednbitrowwise_to_float_or_half_cpu::output_dtype"], [6, 0, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu"], [6, 1, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::bit_rate"], [6, 1, 1, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t", "fusednbitrowwise_to_half_cpu::input"], [7, 0, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu"], [7, 1, 1, 
"_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_boundaries"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_in_use_after"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_ctr_weight_value"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_examples"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::bin_num_positives"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::logit"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::num_segments"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::positive_weight"], [7, 1, 1, 
"_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_lengths"], [7, 1, 1, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td", "generic_histogram_binning_calibration_by_feature_cpu::segment_value"], [8, 0, 1, "_CPPv423get_unique_indices_cudaN2at6TensorE7int64_tb", "get_unique_indices_cuda"], [8, 1, 1, "_CPPv423get_unique_indices_cudaN2at6TensorE7int64_tb", "get_unique_indices_cuda::compute_count"], [8, 1, 1, "_CPPv423get_unique_indices_cudaN2at6TensorE7int64_tb", "get_unique_indices_cuda::linear_indices"], [8, 1, 1, "_CPPv423get_unique_indices_cudaN2at6TensorE7int64_tb", "get_unique_indices_cuda::max_indices"], [6, 0, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu"], [6, 1, 1, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor", "half_to_fused8bitrowwise_cpu::input"], [7, 0, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::bin_ctr_in_use_after"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::bin_ctr_weight_value"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::bin_num_examples"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::bin_num_positives"], [7, 1, 1, 
"_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::logit"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::lower_bound"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::positive_weight"], [7, 1, 1, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td", "histogram_binning_calibration_cpu::upper_bound"], [8, 0, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot"], [8, 1, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::C"], [8, 1, 1, "_CPPv419host_lxu_cache_slot7int64_t7int64_t", "host_lxu_cache_slot::h_in"], [0, 0, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::D_offsets"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::dev_weights"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bias"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::fp8_exponent_bits"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indice_weights"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::indices"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_locations"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::lxu_cache_weights"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float16_D"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float32_D"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_float8_D"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int2_D"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int4_D"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::max_int8_D"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::offsets"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::output_dtype"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::pooling_mode"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::row_alignment"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::total_D"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::uvm_weights"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_offsets"], [0, 1, 1, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_placements"], [0, 1, 1, "_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function::weights_tys"], [0, 0, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::D_offsets"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::dev_weights"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bias"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::fp8_exponent_bits"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indice_weights"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::indices"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_locations"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::lxu_cache_weights"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float16_D"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float32_D"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_float8_D"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int2_D"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int4_D"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::max_int8_D"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::offsets"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::output_dtype"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::pooling_mode"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::row_alignment"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::total_D"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::uvm_weights"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_offsets"], [0, 1, 1, 
"_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_placements"], [0, 1, 1, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE", "int_nbit_split_embedding_codegen_lookup_function_cpu::weights_tys"], [0, 0, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::D_offsets"], 
[0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_hash_size_cumsum"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::cache_index_table_map"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::dev_weights"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bias"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::fp8_exponent_bits"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indice_weights"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::indices"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_locations"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_state"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_cache_weights"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::lxu_state"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float16_D"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float32_D"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_float8_D"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int2_D"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int4_D"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::max_int8_D"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::offsets"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::output_dtype"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::pooling_mode"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::row_alignment"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_D"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::total_cache_hash_size"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::uvm_weights"], [0, 1, 1, 
"_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_offsets"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_placements"], [0, 1, 1, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function::weights_tys"], [0, 0, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::D_offsets"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_hash_size_cumsum"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::cache_index_table_map"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::dev_weights"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bias"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::fp8_exponent_bits"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indice_weights"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::indices"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_locations"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_state"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_cache_weights"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::lxu_state"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float16_D"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float32_D"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_float8_D"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int2_D"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int4_D"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::max_int8_D"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::offsets"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::output_dtype"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::pooling_mode"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::row_alignment"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_D"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::total_cache_hash_size"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::uvm_weights"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_offsets"], [0, 1, 1, 
"_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_placements"], [0, 1, 1, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE", "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu::weights_tys"], [4, 0, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor"], [4, 1, 1, "_CPPv413is_uvm_tensorRK6Tensor", "is_uvm_tensor::self"], [2, 0, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense"], [2, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::max_L"], [2, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::offsets"], [2, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::padding_value"], [2, 1, 1, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t", "jagged_1d_to_dense::values"], [2, 0, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense"], [2, 1, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", 
"jagged_2d_to_dense::max_sequence_length"], [2, 1, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::offsets"], [2, 1, 1, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE", "jagged_2d_to_dense::values"], [2, 0, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_offsets"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::x_values"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add::y"], [2, 0, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output"], [2, 1, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_offsets"], [2, 1, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::x_values"], [2, 1, 1, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output::y"], [2, 0, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda"], [2, 1, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_offsets"], [2, 1, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_add_jagged_output_cuda::x_values"], [2, 1, 1, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor", 
"jagged_dense_elementwise_add_jagged_output_cuda::y"], [2, 0, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_offsets"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::x_values"], [2, 1, 1, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor", "jagged_dense_elementwise_mul::y"], [2, 0, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense"], [2, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::max_lengths"], [2, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::offsets"], [2, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::padding_value"], [2, 1, 1, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense::values"], [2, 0, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward"], [2, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::max_lengths"], [2, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::offsets"], [2, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", "jagged_to_padded_dense_forward::padding_value"], [2, 1, 1, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd", 
"jagged_to_padded_dense_forward::values"], [8, 0, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::D_offsets"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::cache_index_table_map"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::lfu_state"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::linear_cache_indices"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::lxu_cache_weights"], [8, 1, 1, 
"_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::row_alignment"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::total_cache_hash_size"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::weights"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::weights_offsets"], [8, 1, 1, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "lfu_cache_populate_byte_cuda::weights_tys"], [8, 0, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::D_offsets"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::cache_index_table_map"], [8, 1, 1, 
"_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::lfu_state"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::linear_cache_indices"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::stochastic_rounding"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::total_cache_hash_size"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::weights"], [8, 1, 1, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb", "lfu_cache_populate_cuda::weights_offsets"], [8, 0, 1, "_CPPv428linearize_cache_indices_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_cuda"], [8, 1, 1, "_CPPv428linearize_cache_indices_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv428linearize_cache_indices_cudaN2at6TensorEN2at6TensorEN2at6TensorE", 
"linearize_cache_indices_cuda::indices"], [8, 1, 1, "_CPPv428linearize_cache_indices_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_cuda::offsets"], [8, 0, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda"], [8, 1, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_row_indices"], [8, 1, 1, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE", "linearize_cache_indices_from_row_idx_cuda::update_table_indices"], [8, 0, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::lock_cache_line"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::lru_state"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::lxu_cache_locking_counter"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::lxu_cache_state"], [8, 1, 1, 
"_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::max_indices"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::time_stamp"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::unique_indices"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::unique_indices_length"], [8, 1, 1, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE", "lru_cache_find_uncached_cuda::uvm_cache_stats"], [8, 0, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::D_offsets"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::cache_index_table_map"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", 
"lru_cache_populate_byte_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::hash_size_cumsum"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::linear_cache_indices"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::lru_state"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::row_alignment"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::time_stamp"], [8, 1, 1, 
"_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::total_cache_hash_size"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::uvm_cache_stats"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::weights"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::weights_offsets"], [8, 1, 1, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_byte_cuda::weights_tys"], [8, 0, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::D_offsets"], [8, 1, 1, 
"_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::cache_index_table_map"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::hash_size_cumsum"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::linear_cache_indices"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::lock_cache_line"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::lru_state"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::lxu_cache_locking_counter"], [8, 1, 1, 
"_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::stochastic_rounding"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::time_stamp"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::total_cache_hash_size"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::uvm_cache_stats"], [8, 1, 1, "_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::weights"], [8, 1, 1, 
"_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE", "lru_cache_populate_cuda::weights_offsets"], [8, 0, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::D_offsets"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::cache_index_table_map"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::stochastic_rounding"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::total_D"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", "lxu_cache_flush_cuda::uvm_weights"], [8, 1, 1, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb", 
"lxu_cache_flush_cuda::weights_offsets"], [8, 0, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorEN3c108optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda"], [8, 1, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorEN3c108optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations"], [8, 1, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorEN3c108optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::lxu_cache_locations_new"], [8, 1, 1, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorEN3c108optionalIN2at6TensorEEE", "lxu_cache_locations_update_cuda::num_uniq_cache_indices"], [8, 0, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda"], [8, 1, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locations"], [8, 1, 1, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE", "lxu_cache_locking_counter_decrement_cuda::lxu_cache_locking_counter"], [8, 0, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::gather_cache_stats"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::invalid_index"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::linear_cache_indices"], [8, 1, 1, 
"_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::lxu_cache_locations_output"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::lxu_cache_state"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::num_uniq_cache_indices"], [8, 1, 1, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE", "lxu_cache_lookup_cuda::uvm_cache_stats"], [4, 0, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor"], [4, 1, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::self"], [4, 1, 1, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_host_mapped_tensor::sizes"], [4, 0, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor"], [4, 1, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::self"], [4, 1, 1, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor::sizes"], [4, 0, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta"], [4, 1, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::self"], [4, 1, 1, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_managed_tensor_meta::sizes"], [4, 0, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor"], [4, 1, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", 
"new_unified_tensor::is_host_mapped"], [4, 1, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::self"], [4, 1, 1, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb", "new_unified_tensor::sizes"], [4, 0, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor"], [4, 1, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::self"], [4, 1, 1, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE", "new_vanilla_managed_tensor::sizes"], [1, 0, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu"], [1, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::batch_size"], [1, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::include_last_offsets"], [1, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::indices_list"], [1, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::offsets_list"], [1, 1, 1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t", "padding_fused_tbe_input_combine_cpu::per_sample_weights"], [5, 0, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", 
"permute_pooled_embs_auto_grad"], [5, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::inv_permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad::pooled_embs"], [5, 0, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_offset_dim_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::inv_permute_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::offset_dim_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::permute_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_cpu::pooled_embs"], [5, 0, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_offset_dim_list"], [5, 1, 1, 
"_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::inv_permute_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::offset_dim_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::permute_list"], [5, 1, 1, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor", "permute_pooled_embs_auto_grad_gpu::pooled_embs"], [5, 0, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_offset_dim_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::inv_permute_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::offset_dim_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::permute_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_cpu::pooled_embs"], [5, 0, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu"], [5, 1, 1, 
"_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_offset_dim_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::inv_permute_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::offset_dim_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::permute_list"], [5, 1, 1, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_auto_grad_split_gpu::pooled_embs"], [5, 0, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl"], [5, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::allow_duplicates"], [5, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_offset_dim_list"], [5, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::inv_permute_list"], [5, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::offset_dim_list"], [5, 1, 1, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::permute_list"], [5, 1, 1, 
"_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb", "permute_pooled_embs_cpu_impl::pooled_embs"], [5, 0, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::inv_permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_cpu::pooled_embs"], [5, 0, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::inv_permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::offset_dim_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", 
"permute_pooled_embs_split_gpu::permute_list"], [5, 1, 1, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE", "permute_pooled_embs_split_gpu::pooled_embs"], [0, 0, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu"], [0, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings"], [0, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::index_remappings_offsets"], [0, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::indices"], [0, 1, 1, "_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cpu::offsets"], [0, 0, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda"], [0, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings"], [0, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::index_remappings_offsets"], [0, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::indices"], [0, 1, 1, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_array_lookup_cuda::offsets"], [0, 0, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu"], [0, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::dense_indices"], [0, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table"], [0, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::hash_table_offsets"], [0, 1, 1, 
"_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::indices"], [0, 1, 1, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_insert_unweighted_cpu::offsets"], [0, 0, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda"], [0, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table"], [0, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::hash_table_offsets"], [0, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::indices"], [0, 1, 1, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_cuda::offsets"], [0, 0, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu"], [0, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table"], [0, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::hash_table_offsets"], [0, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::indices"], [0, 1, 1, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor", "pruned_hashmap_lookup_unweighted_cpu::offsets"], [3, 0, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda"], [3, 1, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::grad_output"], [3, 1, 1, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_cuda::num_features_per_rank"], [3, 0, 1, 
"_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda"], [3, 1, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::cumsum_dim_sum_per_rank"], [3, 1, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::dim_sum_per_rank"], [3, 1, 1, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor", "recat_embedding_grad_output_mixed_D_batch_cuda::grad_output"], [3, 0, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu"], [3, 1, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::dim_sum_per_rank"], [3, 1, 1, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cpu::grad_output"], [3, 0, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda"], [3, 1, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::dim_sum_per_rank"], [3, 1, 1, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE", "recat_embedding_grad_output_mixed_D_cuda::grad_output"], [8, 0, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda"], [8, 1, 1, 
"_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::D_offsets"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::buffer_ids"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::cache_hash_size_cumsum"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::dev_weights"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::logical_table_ids"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_state"], [8, 1, 1, 
"_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::lxu_cache_weights"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_dev"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_offsets"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_placements"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::momentum1_uvm"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices"], [8, 1, 1, 
"_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::pruned_indices_offsets"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::total_cache_hash_size"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::uvm_weights"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_offsets"], [8, 1, 1, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t", "reset_weight_momentum_cuda::weights_placements"], [0, 0, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::B_offsets"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::D_offsets"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::dev_weights"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::eps"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::feature_requires_grad"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::gradient_clipping"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::hash_size_cumsum"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::indice_weights"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::indices"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::is_experimental"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::learning_rate"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::lxu_cache_locations"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::lxu_cache_weights"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::max_B"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::max_B_feature_rank"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::max_D"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::max_gradient"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::momentum1_dev"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::momentum1_offsets"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::momentum1_placements"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::momentum1_uvm"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::offsets"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::output_dtype"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::placeholder_autograd_tensor"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::pooling_mode"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::stochastic_rounding"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::total_D"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::total_hash_size_bits"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::uvm_weights"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, 
"_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::vbe_output_size"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::weights_offsets"], [0, 1, 1, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adagrad_function::weights_placements"], [0, 0, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::B_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::D_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::beta1"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::beta2"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::dev_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::eps"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::feature_requires_grad"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::gradient_clipping"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::hash_size_cumsum"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::indice_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::indices"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::is_experimental"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::iter"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::learning_rate"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::lxu_cache_locations"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::lxu_cache_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::max_B"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::max_B_feature_rank"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::max_D"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::max_gradient"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum1_dev"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum1_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum1_placements"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::momentum1_uvm"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum2_dev"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum2_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum2_placements"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::momentum2_uvm"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::output_dtype"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::pooling_mode"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::stochastic_rounding"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::total_D"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::total_hash_size_bits"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::uvm_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::vbe_output_offsets_feature_rank"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::vbe_output_size"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::weight_decay"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_adam_function::weights_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_adam_function::weights_placements"], [0, 0, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::B_offsets"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::D_offsets"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::dev_weights"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::eps"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::feature_requires_grad"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_function::gradient_clipping"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::hash_size_cumsum"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::indice_weights"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::indices"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::is_experimental"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::learning_rate"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::lxu_cache_locations"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_function::lxu_cache_weights"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::max_B"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::max_B_feature_rank"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::max_D"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::max_gradient"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::momentum1_dev"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::momentum1_offsets"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_function::momentum1_placements"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::momentum1_uvm"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::offsets"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::output_dtype"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::pooling_mode"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::stochastic_rounding"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_function::total_D"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::total_hash_size_bits"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::uvm_weights"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::vbe_output_offsets_feature_rank"], [0, 1, 1, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::vbe_output_size"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::weight_decay"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::weight_decay_mode"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_function::weights_offsets"], [0, 1, 1, "_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_function::weights_placements"], [0, 0, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::B_offsets"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::D_offsets"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::adjustment_iter"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::adjustment_ub"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::counter_halflife"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::dev_weights"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::eps"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::feature_requires_grad"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::grad_sum_decay"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::gradient_clipping"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::hash_size_cumsum"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::indice_weights"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::indices"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::is_experimental"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::is_tail_id_thresh_ratio"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::iter"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::learning_rate"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::learning_rate_mode"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::lower_bound"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::lxu_cache_locations"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::lxu_cache_weights"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::max_B"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::max_B_feature_rank"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::max_D"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::max_counter"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::max_gradient"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::momentum1_dev"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::momentum1_offsets"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::momentum1_placements"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::momentum1_uvm"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::offsets"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::output_dtype"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::pooling_mode"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::prev_iter_dev"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::prev_iter_offsets"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::prev_iter_placements"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::prev_iter_uvm"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::regularization_mode"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::row_counter_dev"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::row_counter_offsets"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::row_counter_placements"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::row_counter_uvm"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::stochastic_rounding"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::tail_id_threshold"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::total_D"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::total_hash_size_bits"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::uvm_weights"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, 
"_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::vbe_output_size"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::weight_decay"], [0, 1, 
1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::weight_decay_mode"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::weight_norm_coefficient"], [0, 1, 1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::weights_offsets"], [0, 1, 
1, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function::weights_placements"], [0, 0, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::B_offsets"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::D_offsets"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::dev_weights"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::eps"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::feature_requires_grad"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::gradient_clipping"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::hash_size_cumsum"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::indice_weights"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::indices"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::is_experimental"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::learning_rate"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::lxu_cache_locations"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::lxu_cache_weights"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::max_B"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::max_B_feature_rank"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::max_D"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::max_gradient"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::momentum1_dev"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::momentum1_offsets"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::momentum1_placements"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::momentum1_uvm"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::offsets"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::output_dtype"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::pooling_mode"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::stochastic_rounding"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::total_D"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::total_hash_size_bits"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::uvm_weights"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::vbe_output_size"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::weight_decay"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::weight_decay_mode"], [0, 1, 1, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::weights_offsets"], [0, 1, 1, 
"_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function::weights_placements"], [0, 0, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::B_offsets"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::D_offsets"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::dev_weights"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::feature_requires_grad"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::gradient_clipping"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::hash_size_cumsum"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::indice_weights"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::indices"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::is_experimental"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::learning_rate"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::lxu_cache_locations"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::lxu_cache_weights"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::max_B"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::max_B_feature_rank"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::max_D"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::max_gradient"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::offsets"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::output_dtype"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::pooling_mode"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::stochastic_rounding"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::total_D"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::total_hash_size_bits"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::uvm_weights"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::vbe_output_size"], [0, 1, 1, "_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::weights_offsets"], [0, 1, 1, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_approx_sgd_function::weights_placements"], [0, 0, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::B_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::D_offsets"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::beta1"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::beta2"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::dev_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::eps"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::feature_requires_grad"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::gradient_clipping"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::hash_size_cumsum"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::indice_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::indices"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::is_experimental"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::iter"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::learning_rate"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::lxu_cache_locations"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::lxu_cache_weights"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::max_B"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::max_B_feature_rank"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::max_D"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::max_gradient"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum1_dev"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum1_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum1_placements"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum1_uvm"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum2_dev"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum2_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::momentum2_placements"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::momentum2_uvm"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::output_dtype"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::pooling_mode"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::stochastic_rounding"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::total_D"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::total_hash_size_bits"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::uvm_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::vbe_output_offsets_feature_rank"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::vbe_output_size"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::weight_decay"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lamb_function::weights_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_lamb_function::weights_placements"], [0, 0, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::B_offsets"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::D_offsets"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::dev_weights"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::eta"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::feature_requires_grad"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::gradient_clipping"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::hash_size_cumsum"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::indice_weights"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::indices"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::is_experimental"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::learning_rate"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::lxu_cache_locations"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::lxu_cache_weights"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::max_B"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::max_B_feature_rank"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::max_D"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::max_gradient"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::momentum"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::momentum1_dev"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::momentum1_offsets"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::momentum1_placements"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::momentum1_uvm"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::offsets"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::output_dtype"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::pooling_mode"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::stochastic_rounding"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::total_D"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::total_hash_size_bits"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::uvm_weights"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::vbe_output_size"], [0, 1, 1, 
"_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::weight_decay"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::weights_offsets"], [0, 1, 1, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_lars_sgd_function::weights_placements"], [0, 0, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::B_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::D_offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::dev_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::feature_requires_grad"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::hash_size_cumsum"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::indice_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::indices"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::is_experimental"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::lxu_cache_locations"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::lxu_cache_weights"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::max_B"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::max_B_feature_rank"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::max_D"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::offsets"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::output_dtype"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::placeholder_autograd_tensor"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::pooling_mode"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::total_D"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::total_hash_size"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::total_hash_size_bits"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::total_unique_indices"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::uvm_weights"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::vbe_output_size"], [0, 1, 1, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::weights_offsets"], [0, 1, 1, 
"_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_none_function::weights_placements"], [0, 0, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::B_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_partial_rowwise_adam_function::D_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::beta1"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::beta2"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::dev_weights"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::eps"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::feature_requires_grad"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::gradient_clipping"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::hash_size_cumsum"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::indice_weights"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::indices"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::is_experimental"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::iter"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::learning_rate"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::lxu_cache_locations"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::lxu_cache_weights"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::max_B"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::max_B_feature_rank"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::max_D"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::max_gradient"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum1_dev"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum1_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum1_placements"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum1_uvm"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum2_dev"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum2_offsets"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum2_placements"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::momentum2_uvm"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::offsets"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::output_dtype"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::pooling_mode"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::stochastic_rounding"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::total_D"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::total_hash_size_bits"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::uvm_weights"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::vbe_output_size"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::weight_decay"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::weights_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_adam_function::weights_placements"], [0, 0, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::B_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::D_offsets"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::beta1"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::beta2"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::dev_weights"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::eps"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::feature_requires_grad"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::gradient_clipping"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::hash_size_cumsum"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::indice_weights"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::indices"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::is_experimental"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::iter"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::learning_rate"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::lxu_cache_locations"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::lxu_cache_weights"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::max_B"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::max_B_feature_rank"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::max_D"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::max_gradient"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum1_dev"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum1_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum1_placements"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum1_uvm"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum2_dev"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum2_offsets"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum2_placements"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::momentum2_uvm"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::offsets"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::output_dtype"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::pooling_mode"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::stochastic_rounding"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::total_D"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::total_hash_size_bits"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::uvm_weights"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::vbe_output_size"], [0, 1, 1, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::weight_decay"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::weights_offsets"], [0, 1, 1, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_partial_rowwise_lamb_function::weights_placements"], [0, 0, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::B_offsets"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::D_offsets"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::dev_weights"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::eps"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::feature_requires_grad"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::gradient_clipping"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_adagrad_function::hash_size_cumsum"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::indice_weights"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::indices"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::is_experimental"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::learning_rate"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::lxu_cache_locations"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::lxu_cache_weights"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_adagrad_function::max_B"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::max_B_feature_rank"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::max_D"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::max_gradient"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::max_norm"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::momentum1_dev"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::momentum1_offsets"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_adagrad_function::momentum1_placements"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::momentum1_uvm"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::offsets"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::output_dtype"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::pooling_mode"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::stochastic_rounding"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_adagrad_function::total_D"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::total_hash_size_bits"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::uvm_weights"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_adagrad_function::vbe_output_size"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::weight_decay"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::weight_decay_mode"], [0, 1, 1, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::weights_offsets"], [0, 1, 1, 
"_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_function::weights_placements"], [0, 0, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::B_offsets"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::D_offsets"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::adjustment_iter"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::adjustment_ub"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::counter_halflife"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::dev_weights"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::eps"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::feature_requires_grad"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::grad_sum_decay"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::gradient_clipping"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::hash_size_cumsum"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::indice_weights"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::indices"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::is_experimental"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::is_tail_id_thresh_ratio"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::iter"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::learning_rate"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::learning_rate_mode"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::lower_bound"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::lxu_cache_locations"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::lxu_cache_weights"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::max_B"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::max_B_feature_rank"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::max_D"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::max_counter"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::max_gradient"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::momentum1_dev"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::momentum1_offsets"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::momentum1_placements"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::momentum1_uvm"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::offsets"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::output_dtype"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::pooling_mode"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::prev_iter_dev"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::prev_iter_offsets"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::prev_iter_placements"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::prev_iter_uvm"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::regularization_mode"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::row_counter_dev"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::row_counter_offsets"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::row_counter_placements"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::row_counter_uvm"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::stochastic_rounding"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::tail_id_threshold"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::total_D"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::total_hash_size_bits"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::uvm_weights"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::vbe_output_offsets_feature_rank"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::vbe_output_size"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::weight_decay"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::weight_decay_mode"], [0, 1, 1, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::weight_norm_coefficient"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::weights_offsets"], [0, 1, 1, "_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function::weights_placements"], [0, 0, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::B_offsets"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::D_offsets"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::dev_weights"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::eps"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::feature_requires_grad"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::gradient_clipping"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::hash_size_cumsum"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::indice_weights"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::indices"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::is_experimental"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::learning_rate"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::lxu_cache_locations"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::lxu_cache_weights"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::max_B"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::max_B_feature_rank"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::max_D"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::max_gradient"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::momentum1_dev"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::momentum1_offsets"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::momentum1_placements"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::momentum1_uvm"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::offsets"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::output_dtype"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::pooling_mode"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::stochastic_rounding"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::total_D"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::total_hash_size_bits"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::use_homogeneous_placements"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::uvm_weights"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::vbe_output_size"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::weight_decay"], [0, 1, 1, 
"_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::weight_decay_mode"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::weights_offsets"], [0, 1, 1, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function::weights_placements"], [0, 0, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::B_offsets"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::D_offsets"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::dev_weights"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::eps"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::feature_requires_grad"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::gradient_clipping"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::hash_size_cumsum"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::indice_weights"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::indices"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::is_experimental"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::iter"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::learning_rate"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::lxu_cache_locations"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::lxu_cache_weights"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::max_B"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::max_B_feature_rank"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::max_D"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::max_gradient"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::momentum1_dev"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::momentum1_offsets"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::momentum1_placements"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::momentum1_uvm"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::offsets"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", 
"split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::output_dtype"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::pooling_mode"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::stochastic_rounding"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::total_D"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::total_hash_size_bits"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::use_homogeneous_placements"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::use_uniq_cache_locations_bwd"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::uvm_weights"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::vbe_output_size"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::weight_decay"], [0, 1, 1, 
"_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::weights_offsets"], [0, 1, 1, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function::weights_placements"], [0, 0, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::B_offsets"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::D_offsets"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::dev_weights"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::feature_requires_grad"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::gradient_clipping"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::hash_size_cumsum"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::indice_weights"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::indices"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::is_experimental"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::learning_rate"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::lxu_cache_locations"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::lxu_cache_weights"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::max_B"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::max_B_feature_rank"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::max_D"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::max_gradient"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::offsets"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::output_dtype"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::placeholder_autograd_tensor"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::pooling_mode"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::stochastic_rounding"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::total_D"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::total_hash_size_bits"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::use_homogeneous_placements"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::use_uniq_cache_locations_bwd"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::uvm_weights"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::vbe_B_offsets_rank_per_feature"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::vbe_output_offsets_feature_rank"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::vbe_output_size"], [0, 1, 1, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::weights_offsets"], [0, 1, 1, "_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb", "split_embedding_codegen_lookup_sgd_function::weights_placements"], [1, 0, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu"], [1, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::include_last_offsets"], [1, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::indices_list"], [1, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::offsets_list"], [1, 1, 1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE", "tbe_input_combine_cpu::per_sample_weights"], [4, 0, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise"], [4, 1, 1, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::cuda_memory_advise"], [4, 1, 1, 
"_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t", "uvm_cuda_mem_advise::self"], [4, 0, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorN3c108optionalI6TensorEE", "uvm_cuda_mem_prefetch_async"], [4, 1, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorN3c108optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::device_t"], [4, 1, 1, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorN3c108optionalI6TensorEE", "uvm_cuda_mem_prefetch_async::self"], [4, 0, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork"], [4, 1, 1, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor", "uvm_mem_advice_dont_fork::self"], [4, 0, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage"], [4, 1, 1, "_CPPv411uvm_storageRK6Tensor", "uvm_storage::self"], [4, 0, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu"], [4, 1, 1, "_CPPv410uvm_to_cpuRK6Tensor", "uvm_to_cpu::self"], [4, 0, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone"], [4, 1, 1, "_CPPv416uvm_to_cpu_cloneRK6Tensor", "uvm_to_cpu_clone::self"], [4, 0, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device"], [4, 1, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::prototype"], [4, 1, 1, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor", "uvm_to_device::self"], [15, 3, 0, "-", "fbgemm_gpu"]], "fbgemm_gpu.split_table_batched_embeddings_ops": [[15, 4, 1, "", "SplitTableBatchedEmbeddingBagsCodegen"]], "torch.ops.fbgemm": [[14, 4, 1, "", "batched_dense_vec_jagged_2d_mul"], [14, 4, 1, "", "dense_to_jagged"], [14, 4, 1, "", "jagged_1d_to_dense"], [14, 4, 1, "", "jagged_2d_to_dense"], [14, 4, 1, "", "jagged_dense_dense_elementwise_add_jagged_output"], [14, 4, 1, "", "jagged_dense_elementwise_add"], [14, 4, 1, "", "jagged_dense_elementwise_add_jagged_output"], [14, 4, 1, "", "jagged_dense_elementwise_mul"], [14, 4, 1, "", "jagged_to_padded_dense"], [14, 4, 1, "", "stacked_jagged_1d_to_dense"], [14, 4, 1, "", "stacked_jagged_2d_to_dense"]]}, "objtypes": {"0": "cpp:function", "1": "cpp:functionParam", "2": "cpp:templateParam", "3": 
"py:module", "4": "py:function"}, "objnames": {"0": ["cpp", "function", "C++ function"], "1": ["cpp", "functionParam", "C++ function parameter"], "2": ["cpp", "templateParam", "C++ template parameter"], "3": ["py", "module", "Python module"], "4": ["py", "function", "Python function"]}, "titleterms": {"embed": [0, 5, 8, 15], "oper": [0, 1, 2, 3, 4, 5, 6, 7, 8, 14, 15], "cuda": [0, 2, 3, 4, 6, 7, 9, 11, 12], "cpu": [0, 2, 3, 6, 7, 9, 11], "combin": 1, "input": 1, "jag": [2, 14], "tensor": [2, 14], "layout": 3, "transform": 3, "memori": 4, "pool": 5, "merg": 5, "permut": 5, "quantiz": 6, "spars": 7, "data": [7, 18], "tabl": [8, 15, 17, 18, 19, 20, 21, 22], "batch": [8, 15], "build": [9, 10, 17], "instruct": [9, 11], "set": [9, 10, 11], "up": [9, 10, 11], "an": [9, 23], "isol": 9, "environ": [9, 10, 11], "instal": [9, 11, 24], "miniconda": 9, "conda": [9, 11], "onli": [9, 11], "docker": [9, 11], "imag": [9, 19, 20], "cudnn": 9, "rocm": [9, 11, 12], "miopen": 9, "tool": 9, "c": [9, 10, 13], "compil": 9, "other": 9, "pytorch": [9, 11], "through": [9, 11], "pip": [9, 11], "post": [9, 11], "check": [9, 11], "fbgemm_gpu": [9, 10, 11, 12, 13], "packag": [9, 11], "prepar": 9, "The": [9, 19], "process": 9, "For": 9, "develop": 9, "undefin": [9, 11], "symbol": [9, 11], "glibc": 9, "version": 9, "compat": 9, "contribut": 10, "document": [10, 13, 22, 23], "api": [10, 13], "toolchain": 10, "deploy": 10, "preview": 10, "gener": [10, 13, 18], "guidelin": 10, "ad": 10, "python": [10, 11, 13], "code": [10, 19], "nvidia": 11, "driver": 11, "contain": 11, "runtim": 11, "amdgpu": 11, "librari": 11, "public": 11, "pypi": 11, "test": 12, "variant": 12, "benchmark": 12, "welcom": 13, "fbgemm": 13, "": 13, "info": 13, "tbe": 15, "changelog": 16, "configur": 17, "project": 17, "wide": 17, "html": 17, "theme": [17, 23], "option": [17, 18, 20], "base": 17, "toc": 17, "context": 17, "page": 17, "level": [17, 19, 20], "how": 17, "content": [17, 18, 19, 20, 21, 22], "mod": 18, "test_py_modul": 
18, "index": 18, "paramet": 18, "arg": 18, "paragraph": [19, 22], "markup": 19, "inlin": 19, "math": 19, "meta": 19, "block": 19, "liter": 19, "line": 19, "quot": 19, "doctest": 19, "emphas": 19, "number": [19, 20], "sidebar": 19, "ch": 19, "ien": 19, "creativ": 19, "A": 19, "exampl": [19, 21], "refer": 19, "footnot": 19, "citat": 19, "glossari": 19, "target": 19, "direct": 19, "center": 19, "text": 19, "figur": 19, "admonit": 19, "And": 19, "wai": 19, "topic": 19, "rubric": 19, "titl": 19, "replac": 19, "compound": 19, "download": [19, 24], "link": 19, "list": 20, "enumer": 20, "definit": 20, "field": 20, "bullet": 20, "second": 20, "But": 20, "deeper": 20, "down": 20, "rabbit": 20, "hole": 20, "hlist": 20, "grid": 20, "giant": 20, "can": 20, "have": 20, "caption": [20, 23], "like": 20, "thi": [20, 23], "one": 20, "i": [20, 23], "long": [21, 23], "sticki": 21, "nav": 21, "menu": [21, 23], "1": 21, "2": [21, 22], "3": 21, "4": 21, "5": 21, "6": 21, "7": 21, "8": 21, "9": 21, "10": 21, "11": 21, "12": 21, "13": 21, "14": 21, "15": 21, "16": 21, "17": 21, "18": 21, "19": 21, "20": 21, "submenu": 21, "subsubmenu": 21, "structur": 22, "element": 22, "section": 22, "subsect": 22, "subsubsect": 22, "demo": 23, "incredibli": 23, "via": 24, "git": 24}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.intersphinx": 1, "sphinx": 57}, "alltitles": {"Embedding Operators": [[0, "embedding-operators"]], "CUDA Operators": [[0, "cuda-operators"], [2, "cuda-operators"], [3, "cuda-operators"], [6, "cuda-operators"], [7, "cuda-operators"]], "CPU Operators": [[0, "cpu-operators"], [2, "cpu-operators"], [3, "cpu-operators"], [6, "cpu-operators"], [7, "cpu-operators"]], "Combine Input Operators": [[1, "combine-input-operators"]], "Jagged 
Tensor Operators": [[2, "jagged-tensor-operators"], [14, "module-fbgemm_gpu"]], "Layout Transformation Operators": [[3, "layout-transformation-operators"]], "CUDA Memory Operators": [[4, "cuda-memory-operators"]], "Pooled Embeddings Operators": [[5, "pooled-embeddings-operators"]], "Merge Operators": [[5, "merge-operators"]], "Permutation Operators": [[5, "permutation-operators"]], "Quantization Operators": [[6, "quantization-operators"]], "Sparse Data Operators": [[7, "sparse-data-operators"]], "Table Batched Embedding Operators": [[8, "table-batched-embedding-operators"]], "Build Instructions": [[9, "build-instructions"]], "Set Up an Isolated Build Environment": [[9, "set-up-an-isolated-build-environment"]], "Install Miniconda": [[9, "install-miniconda"]], "Set Up the Conda Environment": [[9, "set-up-the-conda-environment"]], "Set Up for CPU-Only Build": [[9, "set-up-for-cpu-only-build"]], "Set Up for CUDA Build": [[9, "set-up-for-cuda-build"]], "CUDA Docker Image": [[9, "cuda-docker-image"]], "Install CUDA": [[9, "install-cuda"]], "Install cuDNN": [[9, "install-cudnn"]], "Set Up for ROCm Build": [[9, "set-up-for-rocm-build"]], "ROCm Docker Image": [[9, "rocm-docker-image"]], "Install ROCm": [[9, "install-rocm"]], "Install MIOpen": [[9, "install-miopen"]], "Install the Build Tools": [[9, "install-the-build-tools"]], "C/C++ Compiler": [[9, "c-c-compiler"]], "Other Build Tools": [[9, "other-build-tools"]], "Install PyTorch": [[9, "install-pytorch"], [11, "install-pytorch"]], "Installation Through Conda": [[9, "installation-through-conda"]], "Installation Through PyTorch PIP": [[9, "installation-through-pytorch-pip"]], "Post-Install Checks": [[9, "post-install-checks"]], "Build the FBGEMM_GPU Package": [[9, "build-the-fbgemm-gpu-package"]], "Preparing the Build": [[9, "preparing-the-build"]], "The Build Process": [[9, "the-build-process"]], "CUDA Build": [[9, "cuda-build"]], "ROCm Build": [[9, "rocm-build"]], "CPU-Only Build": [[9, "cpu-only-build"]], "Post-Build 
Checks (For Developers)": [[9, "post-build-checks-for-developers"]], "Undefined Symbols Check": [[9, "undefined-symbols-check"]], "GLIBC Version Compatibility Check": [[9, "glibc-version-compatibility-check"]], "Contributing Documentation": [[10, "contributing-documentation"]], "Building the API Documentation": [[10, "building-the-api-documentation"]], "Set Up Build Environment": [[10, "set-up-build-environment"]], "Build FBGEMM_GPU": [[10, "build-fbgemm-gpu"]], "Set Up Documentation Toolchain": [[10, "set-up-documentation-toolchain"]], "Build the Documentation": [[10, "build-the-documentation"]], "Deployment Preview": [[10, "deployment-preview"]], "General Documentation Guidelines": [[10, "general-documentation-guidelines"]], "Adding Documentation to Python Code": [[10, "adding-documentation-to-python-code"]], "Adding Documentation to C++ Code": [[10, "adding-documentation-to-c-code"]], "Installation Instructions": [[11, "installation-instructions"]], "Set Up CPU-Only Environment": [[11, "set-up-cpu-only-environment"]], "Set Up CUDA Environment": [[11, "set-up-cuda-environment"]], "Install NVIDIA Drivers": [[11, "install-nvidia-drivers"]], "Set Up the Docker Container and Conda Environment": [[11, "set-up-the-docker-container-and-conda-environment"], [11, "id1"]], "Install the CUDA Runtime": [[11, "install-the-cuda-runtime"]], "Set Up ROCm Environment": [[11, "set-up-rocm-environment"]], "Install AMDGPU Drivers": [[11, "install-amdgpu-drivers"]], "Install Python Libraries": [[11, "install-python-libraries"]], "Install the FBGEMM_GPU Package": [[11, "install-the-fbgemm-gpu-package"]], "Install through PyTorch PIP": [[11, "install-through-pytorch-pip"]], "Install through Public PyPI": [[11, "install-through-public-pypi"]], "Post-Installation Checks": [[11, "post-installation-checks"]], "Undefined Symbols": [[11, "undefined-symbols"]], "Testing FBGEMM_GPU": [[12, "testing-fbgemm-gpu"]], "FBGEMM_GPU Tests": [[12, "fbgemm-gpu-tests"]], "Testing with the CUDA Variant": 
[[12, "testing-with-the-cuda-variant"]], "Testing with the ROCm Variant": [[12, "testing-with-the-rocm-variant"]], "FBGEMM_GPU Benchmarks": [[12, "fbgemm-gpu-benchmarks"]], "Welcome to FBGEMM\u2019s documentation!": [[13, "welcome-to-fbgemm-s-documentation"]], "FBGEMM_GPU General Info": [[13, null]], "FBGEMM_GPU Python API": [[13, null]], "FBGEMM_GPU C++ API": [[13, null]], "Table Batched Embedding (TBE) Operators": [[15, "module-fbgemm_gpu"]], "Changelog": [[16, "changelog"]], "Configuration": [[17, "configuration"]], "Project-wide Configuration": [[17, "project-wide-configuration"]], "HTML Theme Options": [[17, "html-theme-options"]], "Base options": [[17, "base-options"]], "TOC Options": [[17, "toc-options"]], "HTML Context Options": [[17, "html-context-options"]], "Page-level Configuration": [[17, "page-level-configuration"]], "How the Table of Contents builds": [[17, "how-the-table-of-contents-builds"]], ":mod:`test_py_module`": [[18, "mod-test-py-module"]], "Table of Contents": [[18, "table-of-contents"], [19, "table-of-contents"], [20, "table-of-contents"], [21, "table-of-contents"], [22, "table-of-contents"]], "Generated Index": [[18, "generated-index"]], "Optional parameter args": [[18, "optional-parameter-args"]], "Data": [[18, "data"]], "Paragraph Level Markup": [[19, "paragraph-level-markup"]], "Inline Markup": [[19, "inline-markup"]], "Math": [[19, "math"]], "Meta": [[19, "meta"]], "Blocks": [[19, "blocks"]], "Literal Blocks": [[19, "literal-blocks"]], "Line Blocks": [[19, "line-blocks"]], "Block Quotes": [[19, "block-quotes"]], "Doctest Blocks": [[19, "doctest-blocks"]], "Code Blocks": [[19, "code-blocks"]], "Emphasized lines with line numbers": [[19, "emphasized-lines-with-line-numbers"]], "Sidebar": [[19, "sidebar"]], "Ch\u2019ien / The Creative": [[19, null]], "Code with Sidebar": [[19, "code-with-sidebar"]], "A code example": [[19, null]], "References": [[19, "references"]], "Footnotes": [[19, "footnotes"]], "Citations": [[19, "citations"]], 
"Glossary": [[19, "glossary"]], "Targets": [[19, "targets"]], "Directives": [[19, "directives"]], "Contents": [[19, "contents"]], "Centered text": [[19, "centered-text"]], "Images & Figures": [[19, "images-figures"]], "Images": [[19, "images"]], "Figures": [[19, "figures"]], "Admonitions": [[19, "admonitions"]], "And, by the way\u2026": [[19, null]], "Topics, Sidebars, and Rubrics": [[19, "topics-sidebars-and-rubrics"]], "Sidebar Title": [[19, null]], "Topic Title": [[19, null]], "Target Footnotes": [[19, "target-footnotes"]], "Replacement Text": [[19, "replacement-text"]], "Compound Paragraph": [[19, "compound-paragraph"]], "Download Links": [[19, "download-links"]], "Lists & Tables": [[20, "lists-tables"]], "Lists": [[20, "lists"]], "Enumerated Lists": [[20, "enumerated-lists"]], "Definition Lists": [[20, "definition-lists"]], "Option Lists": [[20, "option-lists"]], "Field list": [[20, "field-list"]], "Bullet Lists": [[20, "bullet-lists"]], "Second list level": [[20, "second-list-level"]], "But deeper down the rabbit hole": [[20, "but-deeper-down-the-rabbit-hole"]], "Hlists": [[20, "hlists"]], "Numbered List": [[20, "numbered-list"]], "Tables": [[20, "tables"]], "Grid Tables": [[20, "grid-tables"]], "Giant Tables": [[20, "giant-tables"]], "List Tables": [[20, "list-tables"]], "List tables can have captions like this one.": [[20, "id13"]], "This is a list table with images in it.": [[20, "id14"]], "Long Sticky Nav": [[21, "long-sticky-nav"]], "Example Menu 1": [[21, "example-menu-1"]], "Example Menu 2": [[21, "example-menu-2"]], "Example Menu 3": [[21, "example-menu-3"]], "Example Menu 4": [[21, "example-menu-4"]], "Example Menu 5": [[21, "example-menu-5"]], "Example Menu 6": [[21, "example-menu-6"]], "Example Menu 7": [[21, "example-menu-7"]], "Example Menu 8": [[21, "example-menu-8"]], "Example Menu 9": [[21, "example-menu-9"]], "Example Menu 10": [[21, "example-menu-10"]], "Example Menu 11": [[21, "example-menu-11"]], "Example Menu 12": [[21, 
"example-menu-12"]], "Example Menu 13": [[21, "example-menu-13"]], "Example Menu 14": [[21, "example-menu-14"]], "Example Menu 15": [[21, "example-menu-15"]], "Example Menu 16": [[21, "example-menu-16"]], "Example Menu 17": [[21, "example-menu-17"]], "Example Menu 18": [[21, "example-menu-18"]], "Example Menu 19": [[21, "example-menu-19"]], "Example Menu 20": [[21, "example-menu-20"]], "Example Submenu 1": [[21, "example-submenu-1"]], "Submenu 1": [[21, "submenu-1"], [21, "id2"]], "Subsubmenu 1": [[21, "subsubmenu-1"], [21, "id1"], [21, "id3"], [21, "id5"]], "Subsubmenu 2": [[21, "subsubmenu-2"]], "Submenu 2": [[21, "submenu-2"], [21, "id4"]], "Submenu 3": [[21, "submenu-3"], [21, "id6"]], "Submenu 4": [[21, "submenu-4"], [21, "id7"]], "Submenu 5": [[21, "submenu-5"], [21, "id8"]], "Example Submenu 2": [[21, "example-submenu-2"]], "Structural Elements": [[22, "structural-elements"]], "Document Section": [[22, "document-section"], [22, "id1"]], "Document Subsection": [[22, "document-subsection"], [22, "id2"]], "Document Subsubsection": [[22, "document-subsubsection"]], "Document Paragraph": [[22, "document-paragraph"]], "Structural Elements 2": [[22, "structural-elements-2"]], "Theme Documentation": [[23, null]], "Demo Documents": [[23, null]], "This is an incredibly long caption for a long menu": [[23, null]], "Installation": [[24, "installation"]], "Via Git or Download": [[24, "via-git-or-download"]]}, "indexentries": {"bounds_check_indices_cuda (c++ function)": [[0, "_CPPv425bounds_check_indices_cudaR6TensorR6TensorR6Tensor7int64_tR6TensorRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_t"]], "int_nbit_split_embedding_codegen_lookup_function (c++ function)": [[0, 
"_CPPv448int_nbit_split_embedding_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE"]], "int_nbit_split_embedding_codegen_lookup_function_cpu (c++ function)": [[0, "_CPPv452int_nbit_split_embedding_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function (c++ function)": [[0, "_CPPv460int_nbit_split_embedding_uvm_caching_codegen_lookup_function6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE"]], "int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu (c++ function)": [[0, "_CPPv464int_nbit_split_embedding_uvm_caching_codegen_lookup_function_cpu6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor7int64_t7int64_t7int64_t7int64_t7int64_t7int64_t6Tensor6Tensor7int64_tN3c108optionalI6TensorEE7int64_tN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI7int64_tEEN3c108optionalI6TensorEEN3c108optionalI6TensorEEN3c108optionalI6TensorEE"]], "pruned_array_lookup_cpu (c++ function)": [[0, 
"_CPPv423pruned_array_lookup_cpu6Tensor6Tensor6Tensor6Tensor"]], "pruned_array_lookup_cuda (c++ function)": [[0, "_CPPv424pruned_array_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_insert_unweighted_cpu (c++ function)": [[0, "_CPPv436pruned_hashmap_insert_unweighted_cpu6Tensor6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_cuda (c++ function)": [[0, "_CPPv426pruned_hashmap_lookup_cuda6Tensor6Tensor6Tensor6Tensor"]], "pruned_hashmap_lookup_unweighted_cpu (c++ function)": [[0, "_CPPv436pruned_hashmap_lookup_unweighted_cpu6Tensor6Tensor6Tensor6Tensor"]], "split_embedding_codegen_lookup_adagrad_function (c++ function)": [[0, "_CPPv447split_embedding_codegen_lookup_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_adam_function (c++ function)": [[0, "_CPPv444split_embedding_codegen_lookup_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_approx_rowwise_adagrad_function (c++ function)": [[0, 
"_CPPv462split_embedding_codegen_lookup_approx_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_function (c++ function)": [[0, "_CPPv475split_embedding_codegen_lookup_approx_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_function (c++ function)": [[0, "_CPPv480split_embedding_codegen_lookup_approx_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_approx_sgd_function (c++ function)": [[0, 
"_CPPv450split_embedding_codegen_lookup_approx_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_lamb_function (c++ function)": [[0, "_CPPv444split_embedding_codegen_lookup_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_lars_sgd_function (c++ function)": [[0, "_CPPv448split_embedding_codegen_lookup_lars_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6TensorddddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_none_function (c++ function)": [[0, "_CPPv444split_embedding_codegen_lookup_none_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6Tensor7int64_t7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_partial_rowwise_adam_function (c++ function)": [[0, 
"_CPPv460split_embedding_codegen_lookup_partial_rowwise_adam_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_partial_rowwise_lamb_function (c++ function)": [[0, "_CPPv460split_embedding_codegen_lookup_partial_rowwise_lamb_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_rowwise_adagrad_function (c++ function)": [[0, "_CPPv455split_embedding_codegen_lookup_rowwise_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_rowwise_adagrad_with_counter_function (c++ function)": [[0, 
"_CPPv468split_embedding_codegen_lookup_rowwise_adagrad_with_counter_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensor6Tensorddd7int64_t7int64_t7int64_td7int64_t7int64_t7int64_tdd7int64_t7int64_tddK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_function (c++ function)": [[0, "_CPPv473split_embedding_codegen_lookup_rowwise_adagrad_with_weight_decay_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_rowwise_weighted_adagrad_function (c++ function)": [[0, "_CPPv464split_embedding_codegen_lookup_rowwise_weighted_adagrad_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKb6Tensor6Tensor6Tensor6Tensorddd7int64_tK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "split_embedding_codegen_lookup_sgd_function (c++ function)": [[0, 
"_CPPv443split_embedding_codegen_lookup_sgd_functionRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorRK6TensorK7int64_tK7int64_tRK6TensorK7int64_tRK6TensorRK6TensorK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERK6TensorKbKdKbdK7int64_tRKN3c108optionalI6TensorEERKN3c108optionalI6TensorEERKN3c108optionalI6TensorEEK7int64_tK7int64_tK7int64_tKbKbKb"]], "padding_fused_tbe_input_combine_cpu (c++ function)": [[1, "_CPPv435padding_fused_tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE7int64_t"]], "tbe_input_combine_cpu (c++ function)": [[1, "_CPPv421tbe_input_combine_cpuRKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKNSt6vectorIN2at6TensorEEERKN2at6TensorE"]], "batched_dense_vec_jagged_2d_mul (c++ function)": [[2, "_CPPv431batched_dense_vec_jagged_2d_mulRK6TensorRK6TensorRK6Tensor"]], "dense_to_jagged (c++ function)": [[2, "_CPPv415dense_to_jaggedRK6TensorRKNSt6vectorI6TensorEEN3c108optionalIN2at6SymIntEEE"]], "jagged_1d_to_dense (c++ function)": [[2, "_CPPv418jagged_1d_to_dense6Tensor6TensorN3c106SymIntE7int64_t"]], "jagged_2d_to_dense (c++ function)": [[2, "_CPPv418jagged_2d_to_dense6Tensor6TensorN3c106SymIntE"]], "jagged_dense_elementwise_add (c++ function)": [[2, "_CPPv428jagged_dense_elementwise_addRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output (c++ function)": [[2, "_CPPv442jagged_dense_elementwise_add_jagged_outputRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_add_jagged_output_cuda (c++ function)": [[2, "_CPPv447jagged_dense_elementwise_add_jagged_output_cudaRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_dense_elementwise_mul (c++ function)": [[2, "_CPPv428jagged_dense_elementwise_mulRK6TensorRKNSt6vectorI6TensorEERK6Tensor"]], "jagged_to_padded_dense (c++ function)": [[2, "_CPPv422jagged_to_padded_denseRK6TensorRKNSt6vectorI6TensorEEKN3c1014SymIntArrayRefEKd"]], "jagged_to_padded_dense_forward (c++ 
function)": [[2, "_CPPv430jagged_to_padded_dense_forwardRK6TensorRKNSt6vectorI6TensorEEN3c1014SymIntArrayRefEKd"]], "recat_embedding_grad_output_cuda (c++ function)": [[3, "_CPPv432recat_embedding_grad_output_cuda6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_batch_cuda (c++ function)": [[3, "_CPPv446recat_embedding_grad_output_mixed_D_batch_cudaRK6TensorRK6TensorRK6Tensor"]], "recat_embedding_grad_output_mixed_d_cpu (c++ function)": [[3, "_CPPv439recat_embedding_grad_output_mixed_D_cpuRK6TensorRKNSt6vectorI7int64_tEE"]], "recat_embedding_grad_output_mixed_d_cuda (c++ function)": [[3, "_CPPv440recat_embedding_grad_output_mixed_D_cudaRK6TensorRKNSt6vectorI7int64_tEE"]], "is_uvm_tensor (c++ function)": [[4, "_CPPv413is_uvm_tensorRK6Tensor"]], "new_host_mapped_tensor (c++ function)": [[4, "_CPPv422new_host_mapped_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor (c++ function)": [[4, "_CPPv418new_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_managed_tensor_meta (c++ function)": [[4, "_CPPv423new_managed_tensor_metaRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "new_unified_tensor (c++ function)": [[4, "_CPPv418new_unified_tensorRK6TensorRKNSt6vectorINSt7int64_tEEEb"]], "new_vanilla_managed_tensor (c++ function)": [[4, "_CPPv426new_vanilla_managed_tensorRK6TensorRKNSt6vectorINSt7int64_tEEE"]], "uvm_cuda_mem_advise (c++ function)": [[4, "_CPPv419uvm_cuda_mem_adviseRK6Tensor7int64_t"]], "uvm_cuda_mem_prefetch_async (c++ function)": [[4, "_CPPv427uvm_cuda_mem_prefetch_asyncRK6TensorN3c108optionalI6TensorEE"]], "uvm_mem_advice_dont_fork (c++ function)": [[4, "_CPPv424uvm_mem_advice_dont_forkRK6Tensor"]], "uvm_storage (c++ function)": [[4, "_CPPv411uvm_storageRK6Tensor"]], "uvm_to_cpu (c++ function)": [[4, "_CPPv410uvm_to_cpuRK6Tensor"]], "uvm_to_cpu_clone (c++ function)": [[4, "_CPPv416uvm_to_cpu_cloneRK6Tensor"]], "uvm_to_device (c++ function)": [[4, "_CPPv413uvm_to_deviceRK6TensorRK6Tensor"]], "all_to_one_device (c++ 
function)": [[5, "_CPPv417all_to_one_deviceNSt6vectorIN2at6TensorEEEN2at6DeviceE"]], "permute_pooled_embs_auto_grad (c++ function)": [[5, "_CPPv429permute_pooled_embs_auto_gradRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_cpu (c++ function)": [[5, "_CPPv433permute_pooled_embs_auto_grad_cpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_gpu (c++ function)": [[5, "_CPPv433permute_pooled_embs_auto_grad_gpuRK6TensorRK6TensorRK6TensorRK6TensorRK6Tensor"]], "permute_pooled_embs_auto_grad_split_cpu (c++ function)": [[5, "_CPPv439permute_pooled_embs_auto_grad_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_auto_grad_split_gpu (c++ function)": [[5, "_CPPv439permute_pooled_embs_auto_grad_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_cpu_impl (c++ function)": [[5, "_CPPv428permute_pooled_embs_cpu_implRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKb"]], "permute_pooled_embs_split_cpu (c++ function)": [[5, "_CPPv429permute_pooled_embs_split_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "permute_pooled_embs_split_gpu (c++ function)": [[5, "_CPPv429permute_pooled_embs_split_gpuRKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorERKN2at6TensorE"]], "fp8quantizedtofloat_ref (c++ function)": [[6, "_CPPv423FP8QuantizedToFloat_refPCK7uint8_tK6size_tK6size_tPCfKiKi"]], "fp8rowwise_to_float_cpu (c++ function)": [[6, "_CPPv423FP8rowwise_to_float_cpuRK6TensorbK7int64_t"]], "floattofp8quantized_ref (c++ function)": [[6, "_CPPv423FloatToFP8Quantized_refPCKfK6size_tK6size_tPC7uint8_tKiKiKd"]], "_float_to_fused8bitrowwise_cpu_out (c++ function)": [[6, "_CPPv434_float_to_fused8bitrowwise_cpu_outR6TensorRK6Tensor"]], "_float_to_fusednbitrowwise_gpu_t (c++ function)": [[6, "_CPPv4I0E32_float_to_fusednbitrowwise_gpu_t6TensorRK6TensorK7int64_t"]], 
"_fused8bitrowwise_to_float_cpu_out (c++ function)": [[6, "_CPPv434_fused8bitrowwise_to_float_cpu_outR6TensorRK6Tensor"]], "_fusednbitrowwise_to_float_gpu_t (c++ function)": [[6, "_CPPv4I0E32_fusednbitrowwise_to_float_gpu_t6TensorRK6TensorK7int64_t"]], "float_or_half_to_fused8bitrowwise_cpu (c++ function)": [[6, "_CPPv437float_or_half_to_fused8bitrowwise_cpuRK6Tensor"]], "float_to_fp8rowwise_cpu (c++ function)": [[6, "_CPPv423float_to_FP8rowwise_cpuRK6Tensorb"]], "float_to_fused8bitrowwise_cpu (c++ function)": [[6, "_CPPv429float_to_fused8bitrowwise_cpuRK6Tensor"]], "fused8bitrowwise_to_float_cpu (c++ function)": [[6, "_CPPv429fused8bitrowwise_to_float_cpuRK6Tensor"]], "fused8bitrowwise_to_float_or_half_cpu (c++ function)": [[6, "_CPPv437fused8bitrowwise_to_float_or_half_cpuRK6TensorK7int64_t"]], "fused8bitrowwise_to_half_cpu (c++ function)": [[6, "_CPPv428fused8bitrowwise_to_half_cpuRK6Tensor"]], "fusednbitrowwise_to_float_cpu (c++ function)": [[6, "_CPPv429fusednbitrowwise_to_float_cpuRK6TensorK7int64_t"]], "fusednbitrowwise_to_float_or_half_cpu (c++ function)": [[6, "_CPPv437fusednbitrowwise_to_float_or_half_cpuRK6TensorK7int64_tK7int64_t"]], "fusednbitrowwise_to_half_cpu (c++ function)": [[6, "_CPPv428fusednbitrowwise_to_half_cpuRK6TensorK7int64_t"]], "half_to_fused8bitrowwise_cpu (c++ function)": [[6, "_CPPv428half_to_fused8bitrowwise_cpuRK6Tensor"]], "expand_into_jagged_permute_cuda (c++ function)": [[7, "_CPPv431expand_into_jagged_permute_cudaRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_t"]], "generic_histogram_binning_calibration_by_feature_cpu (c++ function)": [[7, "_CPPv452generic_histogram_binning_calibration_by_feature_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorE7int64_tRKN2at6TensorERKN2at6TensorERKN2at6TensorEd7int64_td"]], "histogram_binning_calibration_cpu (c++ function)": [[7, "_CPPv433histogram_binning_calibration_cpuRKN2at6TensorERKN2at6TensorERKN2at6TensorEddd7int64_td"]], "direct_mapped_lru_cache_populate_byte_cuda (c++ function)": [[8, 
"_CPPv442direct_mapped_lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE"]], "direct_mapped_lxu_cache_lookup_cuda (c++ function)": [[8, "_CPPv435direct_mapped_lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE"]], "get_unique_indices_cuda (c++ function)": [[8, "_CPPv423get_unique_indices_cudaN2at6TensorE7int64_tb"]], "host_lxu_cache_slot (c++ function)": [[8, "_CPPv419host_lxu_cache_slot7int64_t7int64_t"]], "lfu_cache_populate_byte_cuda (c++ function)": [[8, "_CPPv428lfu_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t"]], "lfu_cache_populate_cuda (c++ function)": [[8, "_CPPv423lfu_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEb"]], "linearize_cache_indices_cuda (c++ function)": [[8, "_CPPv428linearize_cache_indices_cudaN2at6TensorEN2at6TensorEN2at6TensorE"]], "linearize_cache_indices_from_row_idx_cuda (c++ function)": [[8, "_CPPv441linearize_cache_indices_from_row_idx_cudaN2at6TensorEN2at6TensorEN2at6TensorE"]], "lru_cache_find_uncached_cuda (c++ function)": [[8, "_CPPv428lru_cache_find_uncached_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tN2at6TensorEbN2at6TensorEbN2at6TensorE"]], "lru_cache_populate_byte_cuda (c++ function)": [[8, "_CPPv428lru_cache_populate_byte_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEE"]], "lru_cache_populate_cuda (c++ function)": [[8, 
"_CPPv423lru_cache_populate_cudaN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEbbN3c108optionalIN2at6TensorEEEbN3c108optionalIN2at6TensorEEE"]], "lxu_cache_flush_cuda (c++ function)": [[8, "_CPPv420lxu_cache_flush_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_tN2at6TensorEN2at6TensorEb"]], "lxu_cache_locations_update_cuda (c++ function)": [[8, "_CPPv431lxu_cache_locations_update_cudaN2at6TensorEN2at6TensorEN3c108optionalIN2at6TensorEEE"]], "lxu_cache_locking_counter_decrement_cuda (c++ function)": [[8, "_CPPv440lxu_cache_locking_counter_decrement_cudaN2at6TensorEN2at6TensorE"]], "lxu_cache_lookup_cuda (c++ function)": [[8, "_CPPv421lxu_cache_lookup_cudaN2at6TensorEN2at6TensorE7int64_tbN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEEN3c108optionalIN2at6TensorEEE"]], "reset_weight_momentum_cuda (c++ function)": [[8, "_CPPv426reset_weight_momentum_cudaN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorEN2at6TensorE7int64_t"]], "batched_dense_vec_jagged_2d_mul() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.batched_dense_vec_jagged_2d_mul"]], "dense_to_jagged() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.dense_to_jagged"]], "fbgemm_gpu": [[14, "module-fbgemm_gpu"], [15, "module-fbgemm_gpu"]], "jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_1d_to_dense"]], "jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_2d_to_dense"]], "jagged_dense_dense_elementwise_add_jagged_output() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_dense_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_add() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_dense_elementwise_add"]], "jagged_dense_elementwise_add_jagged_output() (in 
module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_dense_elementwise_add_jagged_output"]], "jagged_dense_elementwise_mul() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_dense_elementwise_mul"]], "jagged_to_padded_dense() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.jagged_to_padded_dense"]], "module": [[14, "module-fbgemm_gpu"], [15, "module-fbgemm_gpu"]], "stacked_jagged_1d_to_dense() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.stacked_jagged_1d_to_dense"]], "stacked_jagged_2d_to_dense() (in module torch.ops.fbgemm)": [[14, "torch.ops.fbgemm.stacked_jagged_2d_to_dense"]], "splittablebatchedembeddingbagscodegen() (in module fbgemm_gpu.split_table_batched_embeddings_ops)": [[15, "fbgemm_gpu.split_table_batched_embeddings_ops.SplitTableBatchedEmbeddingBagsCodegen"]], "documentation": [[19, "term-Documentation"]], "pep 287": [[19, "index-0"]], "python enhancement proposals": [[19, "index-0"]], "rfc": [[19, "index-1"]], "rfc 2822": [[19, "index-1"]], "reading": [[19, "term-Reading"]], "writing": [[19, "term-Writing"]]}}) \ No newline at end of file diff --git a/sparse__async__cumsum_8cu.html b/sparse__async__cumsum_8cu.html new file mode 100644 index 000000000..663c941a2 --- /dev/null +++ b/sparse__async__cumsum_8cu.html @@ -0,0 +1,190 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_async_cumsum.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_async_cumsum.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH() [1/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "asynchronous_complete_cumsum" ,
                                fbgemm_gpu::asynchronous_complete_cumsum_gpu  )
                                +
                                + +
                                +
                                + +

                                ◆ FBGEMM_OP_DISPATCH() [2/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "asynchronous_exclusive_cumsum" ,
                                fbgemm_gpu::asynchronous_exclusive_cumsum_gpu  )
                                +
                                + +
                                +
                                + +

                                ◆ FBGEMM_OP_DISPATCH() [3/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "asynchronous_inclusive_cumsum" ,
                                fbgemm_gpu::asynchronous_inclusive_cumsum_gpu  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__batched__unary__embeddings_8cu.html b/sparse__batched__unary__embeddings_8cu.html new file mode 100644 index 000000000..6b73c2bd1 --- /dev/null +++ b/sparse__batched__unary__embeddings_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_batched_unary_embeddings.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__block__bucketize__features_8cu.html b/sparse__block__bucketize__features_8cu.html new file mode 100644 index 000000000..37695bd36 --- /dev/null +++ b/sparse__block__bucketize__features_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_block_bucketize_features.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "block_bucketize_sparse_features" ,
                                fbgemm_gpu::block_bucketize_sparse_features_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__bucketize__features_8cu.html b/sparse__bucketize__features_8cu.html new file mode 100644 index 000000000..aaa588ed7 --- /dev/null +++ b/sparse__bucketize__features_8cu.html @@ -0,0 +1,145 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_bucketize_features.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_bucketize_features.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + +

                                +Functions

                                template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                                __global__ __launch_bounds__ (kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "bucketize_sparse_features" ,
                                fbgemm_gpu::bucketize_sparse_features_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__compute__frequency__sequence_8cu.html b/sparse__compute__frequency__sequence_8cu.html new file mode 100644 index 000000000..5b0c98b19 --- /dev/null +++ b/sparse__compute__frequency__sequence_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_compute_frequency_sequence.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_compute_frequency_sequence.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__expand__into__jagged__permute_8cu.html b/sparse__expand__into__jagged__permute_8cu.html new file mode 100644 index 000000000..1441e8edc --- /dev/null +++ b/sparse__expand__into__jagged__permute_8cu.html @@ -0,0 +1,144 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_expand_into_jagged_permute.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_expand_into_jagged_permute.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + +

                                +Functions

                                at::Tensor expand_into_jagged_permute_cuda (const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "expand_into_jagged_permute" ,
                                fbgemm_gpu::expand_into_jagged_permute_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__group__index_8cu.html b/sparse__group__index_8cu.html new file mode 100644 index 000000000..207a74624 --- /dev/null +++ b/sparse__group__index_8cu.html @@ -0,0 +1,149 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_group_index.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_group_index.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Macro Definition Documentation

                                + +

                                ◆ INVOKE_GROUP_INDEX_SELECT_OR_ADD

                                + +
                                +
                                + + + + + + + + + + + +
                                #define INVOKE_GROUP_INDEX_SELECT_OR_ADD( USE_INDEX_SELECT,
                                USE_VAR_COLS )
                                +
                                +Value:
                                group_index_select_or_add_2d_kernel< \
                                +
                                index_t, \
                                +
                                scalar_t, \
                                +
                                USE_INDEX_SELECT, \
                                +
                                USE_VAR_COLS, \
                                +
                                GROUP_INDEX_SELECT_UNROLL_FACTOR, \
                                +
                                GROUP_INDEX_SELECT_COLS_PER_WARP, \
                                +
                                GROUP_INDEX_SELECT_LOG_COLS_PER_WARP> \
                                +
                                <<<grid_size, block_size, 0, at::cuda::getCurrentCUDAStream()>>>( \
                                +
                                input_ptrs, \
                                +
                                output_ptrs, \
                                +
                                indices_ptrs, \
                                +
                                warp_offsets_group, \
                                +
                                num_cols_group, \
                                +
                                num_work_rows, \
                                +
                                group_size)
                                +
                                +
                                +
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__index__add_8cu.html b/sparse__index__add_8cu.html new file mode 100644 index 000000000..fd094259a --- /dev/null +++ b/sparse__index__add_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_index_add.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_index_add.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__index__select_8cu.html b/sparse__index__select_8cu.html new file mode 100644 index 000000000..8093a52b1 --- /dev/null +++ b/sparse__index__select_8cu.html @@ -0,0 +1,158 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_index_select.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_index_select.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + +

                                +Functions

                                template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                                __global__ __launch_bounds__ (kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta
                                 
                                +

                                Macro Definition Documentation

                                + +

                                ◆ LAUNCH_INDEX_SELECT

                                + +
                                +
                                + + + + + + + +
                                #define LAUNCH_INDEX_SELECT( INDICES_SORTED)
                                +
                                +Value:
                                TORCH_DSA_KERNEL_LAUNCH( \
                                +
                                (index_select_2d_kernel< \
                                +
                                index_t, \
                                +
                                scalar_t, \
                                +
                                UNROLL_FACTOR, \
                                +
                                INDICES_SORTED>), \
                                + +
                                std::min(div_round_up(D, UNROLL_FACTOR), kMaxThreads), \
                                +
                                0, \
                                +
                                at::cuda::getCurrentCUDAStream(), \
                                +
                                input_reshaped.packed_accessor64<scalar_t, 2, at::RestrictPtrTraits>(), \
                                +
                                indices.packed_accessor64<index_t, 1, at::RestrictPtrTraits>(), \
                                +
                                INDICES_SORTED \
                                +
                                ? orig_indices \
                                +
                                .packed_accessor64<int64_t, 1, at::RestrictPtrTraits>() \
                                +
                                : dummy_packed_accessor64<int64_t, 1, at::RestrictPtrTraits>(), \
                                +
                                output.packed_accessor64<scalar_t, 2>());
                                +
                                template int64_t
                                Definition gen_batch_index_select_dim0_forward_kernel.cu:1241
                                +
                                template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const int32_t const bool pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > output
                                Definition gen_batch_index_select_dim0_forward_kernel_small.cu:128
                                +
                                template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int32_t, 1, at::RestrictPtrTraits > FixedDivisor const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > indices
                                Definition gen_batch_index_select_dim0_forward_kernel_small.cu:123
                                +
                                template const pta::PackedTensorAccessor64< uint8_t, 1, at::RestrictPtrTraits > const pta::PackedTensorAccessor32< int64_t, 1, at::RestrictPtrTraits > int64_t D
                                Definition gen_embedding_forward_dense_unweighted_nobag_kernel_small.cu:101
                                +
                                constexpr uint32_t cuda_calc_xblock_count(Integer1 num_items, Integer2 threads_per_block)
                                Definition sparse_ops_utils.h:353
                                +
                                +
                                +
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__invert__permute_8cu.html b/sparse__invert__permute_8cu.html new file mode 100644 index 000000000..3883b1da3 --- /dev/null +++ b/sparse__invert__permute_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_invert_permute.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_invert_permute.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "invert_permute" ,
                                fbgemm_gpu::invert_permute_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops_2common_8cuh.html b/sparse__ops_2common_8cuh.html new file mode 100644 index 000000000..51d114276 --- /dev/null +++ b/sparse__ops_2common_8cuh.html @@ -0,0 +1,147 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/common.cuh File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                common.cuh File Reference
                                +
                                +
                                +
                                #include "fbgemm_gpu/ops_utils.h"
                                +#include "fbgemm_gpu/sparse_ops.cuh"
                                +#include "fbgemm_gpu/sparse_ops.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +#include <ATen/ATen.h>
                                +#include <ATen/Dispatch.h>
                                +#include <ATen/core/op_registration/op_registration.h>
                                +#include <ATen/cuda/CUDAContext.h>
                                +#include <ATen/cuda/Exceptions.h>
                                +#include <c10/cuda/CUDADeviceAssertion.h>
                                +#include <c10/cuda/CUDADeviceAssertionHost.h>
                                +#include <c10/cuda/CUDAGuard.h>
                                +#include <torch/library.h>
                                +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                                +#include <cub/device/device_scan.cuh>
                                +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                                +#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                                +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                                +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Macro Definition Documentation

                                + +

                                ◆ LDG

                                + +
                                +
                                + + + + + + + +
                                #define LDG( ptr)   (__ldg(ptr))
                                +
                                + +
                                +
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops_8cuh.html b/sparse__ops_8cuh.html new file mode 100644 index 000000000..517811c29 --- /dev/null +++ b/sparse__ops_8cuh.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.cuh File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                sparse_ops.cuh File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <ATen/cuda/detail/KernelUtils.h>
                                +#include <cuda.h>
                                +#include "./cub_namespace_prefix.cuh"
                                +#include <cub/block/block_reduce.cuh>
                                +#include "./cub_namespace_postfix.cuh"
                                +
                                + + + + diff --git a/sparse__ops_8h.html b/sparse__ops_8h.html new file mode 100644 index 000000000..fc067ccb8 --- /dev/null +++ b/sparse__ops_8h.html @@ -0,0 +1,107 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_ops.h File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <cstdint>
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + + + + +

                                +Functions

                                at::Tensor expand_into_jagged_permute_cuda (const at::Tensor &permute, const at::Tensor &input_offsets, const at::Tensor &output_offsets, int64_t output_size)
                                 
                                std::tuple< at::Tensor, at::Tensor > histogram_binning_calibration_cpu (const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound=0.0, double upper_bound=1.0, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                                 
                                std::tuple< at::Tensor, at::Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                                 
                                +
                                + + + + diff --git a/sparse__ops__cpu_8cpp.html b/sparse__ops__cpu_8cpp.html new file mode 100644 index 000000000..9a19f0917 --- /dev/null +++ b/sparse__ops__cpu_8cpp.html @@ -0,0 +1,263 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_ops_cpu.cpp File Reference
                                +
                                +
                                +
                                #include <algorithm>
                                +#include <cmath>
                                +#include <functional>
                                +#include <ATen/ATen.h>
                                +#include <ATen/TypeDefault.h>
                                +#include <torch/library.h>
                                +#include "ATen/Parallel.h"
                                +#include <ATen/core/dispatch/Dispatcher.h>
                                +#include <torch/csrc/autograd/custom_function.h>
                                +#include "c10/util/MaybeOwned.h"
                                +#include "fbgemm_gpu/dispatch_macros.h"
                                +#include "fbgemm_gpu/sparse_ops.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + + + + + + + + +

                                +Functions

                                Tensor batched_unary_embeddings_forward_cpu (const Tensor &weight, const Tensor &table_offsets, const Tensor &offsets, const Tensor &indices)
                                 
                                std::tuple< at::Tensor, at::Tensor > histogram_binning_calibration_cpu (const at::Tensor &logit, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, double positive_weight, double lower_bound=0.0, double upper_bound=1.0, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                                 
                                std::tuple< at::Tensor, at::Tensor > generic_histogram_binning_calibration_by_feature_cpu (const at::Tensor &logit, const at::Tensor &segment_value, const at::Tensor &segment_lengths, int64_t num_segments, const at::Tensor &bin_num_examples, const at::Tensor &bin_num_positives, const at::Tensor &bin_boundaries, double positive_weight, int64_t bin_ctr_in_use_after=0, double bin_ctr_weight_value=1.0)
                                 
                                Tensor pack_segments_forward_cpu (const Tensor &t_in, const Tensor &lengths, const int64_t max_length)
                                 
                                Tensor pack_segments_backward_cpu (const Tensor &data, const Tensor &lengths, const int64_t total_length, const int64_t max_length)
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ TORCH_LIBRARY_FRAGMENT()

                                + +
                                +
                                + + + + + + + + + + + +
                                TORCH_LIBRARY_FRAGMENT (fbgemm ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [1/4]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                Autograd ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [2/4]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                AutogradCPU ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [3/4]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                CPU ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [4/4]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                Meta ,
                                m  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops__gpu_8cpp.html b/sparse__ops__gpu_8cpp.html new file mode 100644 index 000000000..2a820b74e --- /dev/null +++ b/sparse__ops__gpu_8cpp.html @@ -0,0 +1,222 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_ops_gpu.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_ops_gpu.cpp File Reference
                                +
                                +
                                +
                                #include "ATen/ops/tensor.h"
                                +#include "c10/core/SymInt.h"
                                +#include "c10/core/TensorOptions.h"
                                +#include "fbgemm_gpu/sparse_ops.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +#include <ATen/ATen.h>
                                +#include <ATen/core/op_registration/op_registration.h>
                                +#include <torch/csrc/autograd/custom_function.h>
                                +#include <torch/library.h>
                                +#include <torch/script.h>
                                +#include <cstdint>
                                +#include <stdexcept>
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ TORCH_LIBRARY_FRAGMENT()

                                + +
                                +
                                + + + + + + + + + + + +
                                TORCH_LIBRARY_FRAGMENT (fbgemm ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [1/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                AutogradCUDA ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [2/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                CUDA ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL() [3/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                Meta ,
                                m  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops__meta_8cpp.html b/sparse__ops__meta_8cpp.html new file mode 100644 index 000000000..c5670d976 --- /dev/null +++ b/sparse__ops__meta_8cpp.html @@ -0,0 +1,145 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_ops_meta.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_ops_meta.cpp File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <ATen/AccumulateType.h>
                                +#include <torch/csrc/autograd/custom_function.h>
                                +#include <torch/library.h>
                                +#include "c10/core/SymIntArrayRef.h"
                                +#include "c10/util/DimVector.h"
                                +#include "fbgemm_gpu/sparse_ops.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ TORCH_LIBRARY_IMPL()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                Meta ,
                                m  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops__utils_8h.html b/sparse__ops__utils_8h.html new file mode 100644 index 000000000..645114987 --- /dev/null +++ b/sparse__ops__utils_8h.html @@ -0,0 +1,1277 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_ops_utils.h File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <cstdint>
                                +#include <optional>
                                +#include <string>
                                +
                                + + + + + + + + + +

                                +Classes

                                struct  StackArray< T >
                                 
                                struct  log2_calc_< x >
                                 
                                struct  log2_calc_< 0 >
                                 
                                struct  log2_calc< x >
                                 
                                + + + + + + + +

                                +Functions

                                template<typename Integer1 , typename Integer2 , std::enable_if_t< std::is_integral< Integer1 >::value, bool > = true, std::enable_if_t< std::is_integral< Integer2 >::value, bool > = true>
                                constexpr uint32_t cuda_calc_xblock_count_base (Integer1 num_items, Integer2 threads_per_block)
                                 
                                template<typename Integer1 , typename Integer2 , std::enable_if_t< std::is_integral< Integer1 >::value, bool > = true, std::enable_if_t< std::is_integral< Integer2 >::value, bool > = true>
                                constexpr uint32_t cuda_calc_block_count (Integer1 num_items, Integer2 threads_per_block)
                                 
                                +

                                Macro Definition Documentation

                                + +

                                ◆ DISPATCH_TO_ALL

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_ALL( name,
                                function )    m.impl(name, torch::dispatch(c10::DispatchKey::CatchAll, TORCH_FN(function)))
                                +
                                + +
                                +
                                + +

                                ◆ DISPATCH_TO_AUTOGRAD

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_AUTOGRAD( name,
                                function )    m.impl(name, torch::dispatch(c10::DispatchKey::Autograd, TORCH_FN(function)))
                                +
                                + +
                                +
                                + +

                                ◆ DISPATCH_TO_AUTOGRAD_CUDA

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_AUTOGRAD_CUDA( name,
                                function )
                                +
                                +Value:
                                m.impl( \
                                +
                                name, \
                                +
                                torch::dispatch(c10::DispatchKey::AutogradCUDA, TORCH_FN(function)))
                                +
                                +
                                +
                                + +

                                ◆ DISPATCH_TO_AUTOGRAD_META

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_AUTOGRAD_META( name,
                                function )
                                +
                                +Value:
                                m.impl( \
                                +
                                name, \
                                +
                                torch::dispatch(c10::DispatchKey::AutogradMETA, TORCH_FN(function)))
                                +
                                +
                                +
                                + +

                                ◆ DISPATCH_TO_CPU

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_CPU( name,
                                function )    m.impl(name, torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(function)))
                                +
                                + +
                                +
                                + +

                                ◆ DISPATCH_TO_CUDA

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_CUDA( name,
                                function )    m.impl(name, torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(function)))
                                +
                                + +
                                +
                                + +

                                ◆ DISPATCH_TO_META

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DISPATCH_TO_META( name,
                                function )    m.impl(name, torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(function)))
                                +
                                + +
                                +
                                + +

                                ◆ JAGGED_TENSOR_DISPATCH_DIMS

                                + +
                                +
                                + + + + + + + +
                                #define JAGGED_TENSOR_DISPATCH_DIMS()
                                +
                                +Value:
                                AT_DISPATCH_INDEX_TYPES(x_offsets[0].scalar_type(), "jagged_indices", [=] { \
                                +
                                switch (num_jagged_dim) { \
                                +
                                case 1: \
                                +
                                INVOKE_KERNEL_WITH_DIM(1); \
                                +
                                break; \
                                +
                                case 2: \
                                +
                                INVOKE_KERNEL_WITH_DIM(2); \
                                +
                                break; \
                                +
                                case 3: \
                                +
                                INVOKE_KERNEL_WITH_DIM(3); \
                                +
                                break; \
                                +
                                case 4: \
                                +
                                INVOKE_KERNEL_WITH_DIM(4); \
                                +
                                break; \
                                +
                                case 5: \
                                +
                                INVOKE_KERNEL_WITH_DIM(5); \
                                +
                                break; \
                                +
                                default: \
                                +
                                TORCH_CHECK( \
                                +
                                false, "unsupported number of jagged dim ", num_jagged_dim); \
                                +
                                } \
                                +
                                });
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_CONTIGUOUS

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_CONTIGUOUS( x)    TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")
                                +
                                + +
                                +
                                + +

                                ◆ TENSOR_CONTIGUOUS_AND_ON_CPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_CONTIGUOUS_AND_ON_CPU( x)
                                +
                                +Value:
                                +
                                TENSOR_CONTIGUOUS(x)
                                +
                                #define TENSOR_ON_CPU(x)
                                Definition sparse_ops_utils.h:124
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_CONTIGUOUS_AND_ON_CUDA_GPU( x)
                                +
                                +Value:
                                +
                                TENSOR_CONTIGUOUS(x)
                                +
                                #define TENSOR_ON_CUDA_GPU(x)
                                Definition sparse_ops_utils.h:136
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_EMPTY_OR_ON_CPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_EMPTY_OR_ON_CPU( x)
                                +
                                +Value:
                                TORCH_CHECK( \
                                + +
                                #x " must be empty or a CPU tensor; it is currently on device ", \
                                + +
                                std::string torch_tensor_device_name(const at::Tensor &ten)
                                Definition sparse_ops_utils.h:38
                                +
                                bool torch_tensor_empty_or_on_cpu_check(const at::Tensor &ten)
                                Definition sparse_ops_utils.h:90
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_EMPTY_OR_ON_CUDA_GPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_EMPTY_OR_ON_CUDA_GPU( x)
                                +
                                +Value:
                                TORCH_CHECK( \
                                + +
                                #x " must be empty or a CUDA tensor; it is currently on device ", \
                                + +
                                bool torch_tensor_empty_or_on_cuda_gpu_check(const at::Tensor &ten)
                                Definition sparse_ops_utils.h:80
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_NDIM_EQUALS

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSOR_NDIM_EQUALS( ten,
                                dims )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (ten).ndimension() == (dims), \
                                +
                                "Tensor '" #ten "' must have " #dims \
                                +
                                " dimension(s). " \
                                +
                                "Found ", \
                                +
                                (ten).ndimension())
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_NDIM_EXCEEDS

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSOR_NDIM_EXCEEDS( ten,
                                dims )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (ten).dim() > (dims), \
                                +
                                "Tensor '" #ten "' must have more than " #dims \
                                +
                                " dimension(s). " \
                                +
                                "Found ", \
                                +
                                (ten).ndimension())
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_NDIM_IS_GE

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSOR_NDIM_IS_GE( ten,
                                dims )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (ten).dim() >= (dims), \
                                +
                                "Tensor '" #ten "' must have >=" #dims \
                                +
                                " dimension(s). " \
                                +
                                "Found ", \
                                +
                                (ten).ndimension())
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_ON_CPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_ON_CPU( x)
                                +
                                +Value:
                                TORCH_CHECK( \
                                + +
                                #x " must be a CPU tensor; it is currently on device ", \
                                + +
                                bool torch_tensor_on_cpu_check(const at::Tensor &ten)
                                Definition sparse_ops_utils.h:16
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_ON_CUDA_GPU

                                + +
                                +
                                + + + + + + + +
                                #define TENSOR_ON_CUDA_GPU( x)
                                +
                                +Value:
                                TORCH_CHECK( \
                                + +
                                #x " must be a CUDA tensor; it is currently on device ", \
                                + +
                                bool torch_tensor_on_cuda_gpu_check(const at::Tensor &ten)
                                Definition sparse_ops_utils.h:71
                                +
                                +
                                +
                                + +

                                ◆ TENSOR_TYPE_MUST_BE

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSOR_TYPE_MUST_BE( ten,
                                typ )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (ten).scalar_type() == typ, \
                                +
                                "Tensor '" #ten "' must have scalar type " #typ " but it had type ", \
                                +
                                (ten).dtype().name())
                                +
                                +
                                +
                                + +

                                ◆ TENSORS_EMPTY_OR_ON_SAME_DEVICE

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSORS_EMPTY_OR_ON_SAME_DEVICE( x,
                                y )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                torch_tensor_on_same_device_check(x, y) || (x.numel() == 0), \
                                +
                                #x " must be empty or a CUDA tensor; it is currently on device ", \
                                + +
                                bool torch_tensor_on_same_device_check(const at::Tensor &ten1, const at::Tensor &ten2)
                                Definition sparse_ops_utils.h:51
                                +
                                +
                                +
                                + +

                                ◆ TENSORS_HAVE_SAME_NUMEL

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSORS_HAVE_SAME_NUMEL( x,
                                y )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (x).numel() == (y).numel(), \
                                +
                                #x " must have the same number of elements as " #y " They had ", \
                                +
                                (x).numel(), \
                                +
                                " and ", \
                                +
                                (y).numel())
                                +
                                +
                                +
                                + +

                                ◆ TENSORS_HAVE_SAME_TYPE

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSORS_HAVE_SAME_TYPE( x,
                                y )
                                +
                                +Value:
                                TORCH_CHECK( \
                                +
                                (x).dtype() == (y).dtype(), \
                                +
                                #x " must have the same type as " #y " types were ", \
                                +
                                (x).dtype().name(), \
                                +
                                " and ", \
                                +
                                (y).dtype().name())
                                +
                                +
                                +
                                + +

                                ◆ TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL

                                + +
                                +
                                + + + + + + + +
                                #define TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL( ...)
                                +
                                +Value:
                                do { \
                                +
                                const auto tensors_on_same_gpu = \
                                +
                                tensor_on_same_gpu_if_not_optional_check(#__VA_ARGS__, __VA_ARGS__); \
                                +
                                TORCH_CHECK(tensors_on_same_gpu.empty(), tensors_on_same_gpu); \
                                +
                                } while (false)
                                +
                                +
                                +
                                + +

                                ◆ TENSORS_ON_SAME_DEVICE

                                + +
                                +
                                + + + + + + + + + + + +
                                #define TENSORS_ON_SAME_DEVICE( x,
                                y )
                                +
                                +Value:
                                TORCH_CHECK( \
                                + +
                                #x " must be on the same device as " #y "! " #x " is currently on ", \
                                + +
                                #y " is currently on ", \
                                + +
                                +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ binary_search_range_cpu()

                                + +
                                +
                                +
                                +template<typename scalar_t >
                                + + + + + + + + + + + + + + + + + + + + + +
                                void binary_search_range_cpu (int * found,
                                const scalar_t * arr,
                                const scalar_t target,
                                const int num_entries )
                                +
                                + +
                                +
                                + +

                                ◆ cuda_calc_block_count()

                                + +
                                +
                                +
                                +template<typename Integer1 , typename Integer2 , std::enable_if_t< std::is_integral< Integer1 >::value, bool > = true, std::enable_if_t< std::is_integral< Integer2 >::value, bool > = true>
                                + + + + + +
                                + + + + + + + + + + + +
                                constexpr uint32_t cuda_calc_block_count (Integer1 num_items,
                                Integer2 threads_per_block )
                                +
                                +constexpr
                                +
                                +

                                Determine an appropriate CUDA block count.

                                +

                                See cuda_calc_xblock_count_base() for details.

                                + +
                                +
                                + +

                                ◆ cuda_calc_xblock_count()

                                + +
                                +
                                +
                                +template<typename Integer1 , typename Integer2 , std::enable_if_t< std::is_integral< Integer1 >::value &&std::is_signed< Integer2 >::value, bool > = true, std::enable_if_t< std::is_integral< Integer2 >::value &&std::is_unsigned< Integer2 >::value, bool > = true>
                                + + + + + +
                                + + + + + + + + + + + +
                                constexpr uint32_t cuda_calc_xblock_count (Integer1 num_items,
                                Integer2 threads_per_block )
                                +
                                +constexpr
                                +
                                + +
                                +
                                + +

                                ◆ cuda_calc_xblock_count_base()

                                + +
                                +
                                +
                                +template<typename Integer1 , typename Integer2 , std::enable_if_t< std::is_integral< Integer1 >::value, bool > = true, std::enable_if_t< std::is_integral< Integer2 >::value, bool > = true>
                                + + + + + +
                                + + + + + + + + + + + +
                                constexpr uint32_t cuda_calc_xblock_count_base (Integer1 num_items,
                                Integer2 threads_per_block )
                                +
                                +constexpr
                                +
                                +

                                Determine an appropriate CUDA block count along the x axis

                                +

                                When launching CUDA kernels the number of blocks B is often calculated w.r.t. the number of threads T and items to be processed N as B=(N+T-1)/T - which is integer division rounding up. This function abstracts that calculation, performs it in an overflow-safe manner, and limits the return value appropriately.

                                +

                                This is a general function for all integral data types. The goal of this set of functions is to ensure correct calculations across a variety of data types without forcing the programmer to cast to an appropriate type (which is dangerous because we don't have conversion warnings enabled). The values of the variables can then be checked for correctness at run-time. Specialized functions below handle various combinations of signed and unsigned inputs. This system prevents "pointless comparison +against zero" warnings from the compiler for unsigned types (simpler ways of suppressing this warning didn't work) while maintaining the various warnings.

                                +

                                Function is designed to facilitate run-time value checking.

                                + +
                                +
                                + +

                                ◆ get_device_index_from_tensor() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                std::optional< int64_t > get_device_index_from_tensor (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ get_device_index_from_tensor() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                std::optional< int64_t > get_device_index_from_tensor (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ tensor_on_same_gpu_if_not_optional_check()

                                + +
                                +
                                +
                                +template<typename... Tensors>
                                + + + + + + + + + + + +
                                std::string tensor_on_same_gpu_if_not_optional_check (const std::string & var_names_str,
                                const Tensors &... tensors )
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_device_name() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                std::string torch_tensor_device_name (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_device_name() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                std::string torch_tensor_device_name (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_empty_or_on_cpu_check() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_empty_or_on_cpu_check (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_empty_or_on_cpu_check() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_empty_or_on_cpu_check (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_empty_or_on_cuda_gpu_check() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_empty_or_on_cuda_gpu_check (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_empty_or_on_cuda_gpu_check() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_empty_or_on_cuda_gpu_check (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_cpu_check() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_on_cpu_check (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_cpu_check() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_on_cpu_check (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_cuda_gpu_check() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_on_cuda_gpu_check (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_cuda_gpu_check() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_on_cuda_gpu_check (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_same_device_check() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + + + + + +
                                bool torch_tensor_on_same_device_check (const at::Tensor & ten1,
                                const at::Tensor & ten2 )
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_on_same_device_check() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + + + + + +
                                bool torch_tensor_on_same_device_check (const at::Tensor & ten1,
                                const c10::optional< at::Tensor > & ten2 )
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_undefined() [1/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_undefined (const at::Tensor & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                + +

                                ◆ torch_tensor_undefined() [2/2]

                                + +
                                +
                                + + + + + +
                                + + + + + + + +
                                bool torch_tensor_undefined (const c10::optional< at::Tensor > & ten)
                                +
                                +inline
                                +
                                + +
                                +
                                +

                                Variable Documentation

                                + +

                                ◆ kStackArrayMaxDims

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr size_t kStackArrayMaxDims = 5
                                +
                                +constexpr
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__ops__utils__test_8cpp.html b/sparse__ops__utils__test_8cpp.html new file mode 100644 index 000000000..7ba904f4c --- /dev/null +++ b/sparse__ops__utils__test_8cpp.html @@ -0,0 +1,194 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/test/sparse_ops_utils_test.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                sparse_ops_utils_test.cpp File Reference
                                +
                                +
                                +
                                #include <gmock/gmock.h>
                                +#include <gtest/gtest.h>
                                +#include <ATen/ATen.h>
                                +#include <torch/torch.h>
                                +#include "deeplearning/fbgemm/fbgemm_gpu/include/fbgemm_gpu/sparse_ops_utils.h"
                                +

                                Function Documentation

                                + +

                                ◆ get_valid_cpu_tensor()

                                + +
                                +
                                + + + + + + + +
                                at::Tensor get_valid_cpu_tensor ()
                                +
                                + +
                                +
                                + +

                                ◆ TEST() [1/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                TEST (sparse_ops_utils_test ,
                                cpu_tensors_fail  )
                                +
                                + +
                                +
                                + +

                                ◆ TEST() [2/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                TEST (sparse_ops_utils_test ,
                                gpu_tensors_pass  )
                                +
                                + +
                                +
                                + +

                                ◆ TEST() [3/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                TEST (sparse_ops_utils_test ,
                                optional_tensor_passes  )
                                +
                                + +
                                +
                                + +

                                ◆ TEST() [4/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                TEST (sparse_ops_utils_test ,
                                undefined_tensors_do_not_trigger  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__pack__segments__backward_8cu.html b/sparse__pack__segments__backward_8cu.html new file mode 100644 index 000000000..ecf22383e --- /dev/null +++ b/sparse__pack__segments__backward_8cu.html @@ -0,0 +1,117 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_backward.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_pack_segments_backward.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + +

                                +Functions

                                DLL_PUBLIC Tensor pack_segments_backward_cuda (const Tensor &data, const Tensor &lengths, int64_t total_length, int64_t max_length)
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__pack__segments__forward_8cu.html b/sparse__pack__segments__forward_8cu.html new file mode 100644 index 000000000..6c1f81311 --- /dev/null +++ b/sparse__pack__segments__forward_8cu.html @@ -0,0 +1,117 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_pack_segments_forward.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_pack_segments_forward.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + +

                                +Functions

                                DLL_PUBLIC Tensor pack_segments_forward_cuda (const Tensor &t_in, const Tensor &lengths, const int64_t max_length)
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__permute102_8cu.html b/sparse__permute102_8cu.html new file mode 100644 index 000000000..19fce0ee6 --- /dev/null +++ b/sparse__permute102_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_permute102.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_permute102.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute102_baddbmm_permute102" ,
                                fbgemm_gpu::permute102_baddbmm_permute102_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__permute__1d_8cu.html b/sparse__permute__1d_8cu.html new file mode 100644 index 000000000..797c655c7 --- /dev/null +++ b/sparse__permute__1d_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_permute_1d.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_permute_1d.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute_1D_sparse_data" ,
                                fbgemm_gpu::permute_1D_sparse_data_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__permute__2d_8cu.html b/sparse__permute__2d_8cu.html new file mode 100644 index 000000000..07b12e6fb --- /dev/null +++ b/sparse__permute__2d_8cu.html @@ -0,0 +1,197 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_permute_2d.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_permute_2d.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + +

                                +Functions

                                template<int NUM_JAGGED_DIM, typename index_t , typename scalar_t , typename F >
                                __global__ __launch_bounds__ (kMaxThreads) void jagged_jagged_elementwise_dense_output_kernel_(const pta
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH() [1/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute_2D_sparse_data" ,
                                fbgemm_gpu::permute_2D_sparse_data_cuda  )
                                +
                                + +
                                +
                                + +

                                ◆ FBGEMM_OP_DISPATCH() [2/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute_sparse_data" ,
                                fbgemm_gpu::permute_2D_sparse_data_cuda  )
                                +
                                + +
                                +
                                + +

                                ◆ FBGEMM_OP_DISPATCH() [3/3]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute_sparse_features" ,
                                fbgemm_gpu::permute_sparse_features_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__permute__embeddings_8cu.html b/sparse__permute__embeddings_8cu.html new file mode 100644 index 000000000..8c022343e --- /dev/null +++ b/sparse__permute__embeddings_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_permute_embeddings.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_permute_embeddings.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "permute_sequence_embeddings" ,
                                fbgemm_gpu::permute_sequence_embeddings_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__range_8cu.html b/sparse__range_8cu.html new file mode 100644 index 000000000..a6e7dacb7 --- /dev/null +++ b/sparse__range_8cu.html @@ -0,0 +1,164 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_range.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_range.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH() [1/2]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "lengths_range" ,
                                fbgemm_gpu::lengths_range_cuda  )
                                +
                                + +
                                +
                                + +

                                ◆ FBGEMM_OP_DISPATCH() [2/2]

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "offsets_range" ,
                                fbgemm_gpu::offsets_range_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__reorder__batched__ad_8cu.html b/sparse__reorder__batched__ad_8cu.html new file mode 100644 index 000000000..b43f623f2 --- /dev/null +++ b/sparse__reorder__batched__ad_8cu.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_reorder_batched_ad.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_reorder_batched_ad.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__segment__sum__csr_8cu.html b/sparse__segment__sum__csr_8cu.html new file mode 100644 index 000000000..7a9d44823 --- /dev/null +++ b/sparse__segment__sum__csr_8cu.html @@ -0,0 +1,138 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_segment_sum_csr.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_segment_sum_csr.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ FBGEMM_OP_DISPATCH()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                FBGEMM_OP_DISPATCH (CUDA ,
                                "segment_sum_csr" ,
                                fbgemm_gpu::segment_sum_csr_cuda  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/sparse__zipf_8cu.html b/sparse__zipf_8cu.html new file mode 100644 index 000000000..286eb8a13 --- /dev/null +++ b/sparse__zipf_8cu.html @@ -0,0 +1,139 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/sparse_ops/sparse_zipf.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                sparse_zipf.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + +

                                +Classes

                                struct  rk_state
                                 
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ TORCH_LIBRARY_FRAGMENT()

                                + +
                                +
                                + + + + + + + + + + + +
                                TORCH_LIBRARY_FRAGMENT (fbgemm ,
                                m  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/split__embeddings__cache_2common_8cuh.html b/split__embeddings__cache_2common_8cuh.html new file mode 100644 index 000000000..c6e3345a9 --- /dev/null +++ b/split__embeddings__cache_2common_8cuh.html @@ -0,0 +1,129 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/common.cuh File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                common.cuh File Reference
                                +
                                +
                                +
                                #include "common.h"
                                +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                                +#include <cub/device/device_radix_sort.cuh>
                                +#include <cub/device/device_run_length_encode.cuh>
                                +#include <cub/device/device_select.cuh>
                                +#include <cub/block/block_reduce.cuh>
                                +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                                +#include <ATen/cuda/CUDAContext.h>
                                +#include <ATen/cuda/CUDAGeneratorImpl.h>
                                +#include <ATen/cuda/detail/KernelUtils.h>
                                +#include <c10/cuda/CUDAGuard.h>
                                +#include <cuda.h>
                                +#include <cuda_runtime.h>
                                +#include <curand_kernel.h>
                                +#include <ATen/cuda/Atomic.cuh>
                                +#include <ATen/cuda/CUDAGraphsUtils.cuh>
                                +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                                +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
                                +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/split__embeddings__cache_2common_8h.html b/split__embeddings__cache_2common_8h.html new file mode 100644 index 000000000..66ce0baa1 --- /dev/null +++ b/split__embeddings__cache_2common_8h.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/common.h File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                common.h File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <ATen/AccumulateType.h>
                                +#include <ATen/TensorUtils.h>
                                +#include <ATen/core/TensorAccessor.h>
                                +#include <limits>
                                +#include <mutex>
                                +#include "fbgemm_gpu/dispatch_macros.h"
                                +#include "fbgemm_gpu/embedding_common.h"
                                +#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
                                +#include "fbgemm_gpu/ops_utils.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/split__embeddings__cache__cuda_8cuh.html b/split__embeddings__cache__cuda_8cuh.html new file mode 100644 index 000000000..18628331e --- /dev/null +++ b/split__embeddings__cache__cuda_8cuh.html @@ -0,0 +1,162 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_cache_cuda.cuh File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                split_embeddings_cache_cuda.cuh File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +
                                + + + +

                                +Namespaces

                                namespace  fbgemm_gpu
                                 
                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

                                +Functions

                                std::tuple< at::Tensor, at::Tensor, c10::optional< at::Tensor > > get_unique_indices_cuda (at::Tensor linear_indices, int64_t max_indices, bool compute_count)
                                 
                                std::pair< at::Tensor, at::Tensorlru_cache_find_uncached_cuda (at::Tensor unique_indices, at::Tensor unique_indices_length, int64_t max_indices, at::Tensor lxu_cache_state, int64_t time_stamp, at::Tensor lru_state, bool gather_cache_stats, at::Tensor uvm_cache_stats, bool lock_cache_line, at::Tensor lxu_cache_locking_counter)
                                 
                                int64_t host_lxu_cache_slot (int64_t h_in, int64_t C)
                                 
                                at::Tensor linearize_cache_indices_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor indices, at::Tensor offsets)
                                 
                                at::Tensor linearize_cache_indices_from_row_idx_cuda (at::Tensor cache_hash_size_cumsum, at::Tensor update_table_indices, at::Tensor update_row_indices)
                                 
                                void lru_cache_populate_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, bool stochastic_rounding, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, bool lock_cache_line, c10::optional< at::Tensor > lxu_cache_locking_counter)
                                 
                                void lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                                 
                                void direct_mapped_lru_cache_populate_byte_cuda (at::Tensor weights, at::Tensor hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, int64_t time_stamp, at::Tensor lru_state, at::Tensor lxu_cache_miss_timestamp, int64_t row_alignment, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                                 
                                void lfu_cache_populate_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, bool stochastic_rounding)
                                 
                                void lfu_cache_populate_byte_cuda (at::Tensor weights, at::Tensor cache_hash_size_cumsum, int64_t total_cache_hash_size, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor weights_tys, at::Tensor D_offsets, at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, at::Tensor lfu_state, int64_t row_alignment)
                                 
                                at::Tensor lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats, c10::optional< at::Tensor > num_uniq_cache_indices, c10::optional< at::Tensor > lxu_cache_locations_output)
                                 
                                at::Tensor direct_mapped_lxu_cache_lookup_cuda (at::Tensor linear_cache_indices, at::Tensor lxu_cache_state, int64_t invalid_index, bool gather_cache_stats, c10::optional< at::Tensor > uvm_cache_stats)
                                 
                                void lxu_cache_flush_cuda (at::Tensor uvm_weights, at::Tensor cache_hash_size_cumsum, at::Tensor cache_index_table_map, at::Tensor weights_offsets, at::Tensor D_offsets, int64_t total_D, at::Tensor lxu_cache_state, at::Tensor lxu_cache_weights, bool stochastic_rounding)
                                 
                                void lxu_cache_locking_counter_decrement_cuda (at::Tensor lxu_cache_locking_counter, at::Tensor lxu_cache_locations)
                                 
                                void lxu_cache_locations_update_cuda (at::Tensor lxu_cache_locations, at::Tensor lxu_cache_locations_new, c10::optional< at::Tensor > num_uniq_cache_indices)
                                 
                                +

                                Function Documentation

                                + +

                                ◆ emulate_cache_miss()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + +
                                at::Tensor emulate_cache_miss (at::Tensor lxu_cache_locations,
                                const int64_t enforced_misses_per_256,
                                const bool gather_cache_stats,
                                at::Tensor uvm_cache_stats )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/split__embeddings__cache__ops_8cpp.html b/split__embeddings__cache__ops_8cpp.html new file mode 100644 index 000000000..257d31a75 --- /dev/null +++ b/split__embeddings__cache__ops_8cpp.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                split_embeddings_cache_ops.cpp File Reference
                                +
                                +
                                +
                                #include "common.h"
                                +
                                + + + + diff --git a/split__embeddings__cache__ops_8cu.html b/split__embeddings__cache__ops_8cu.html new file mode 100644 index 000000000..3008ac8e6 --- /dev/null +++ b/split__embeddings__cache__ops_8cu.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_cache/split_embeddings_cache_ops.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                split_embeddings_cache_ops.cu File Reference
                                +
                                +
                                +
                                #include "common.cuh"
                                +
                                + + + + diff --git a/split__embeddings__utils_8cpp.html b/split__embeddings__utils_8cpp.html new file mode 100644 index 000000000..0a912eb50 --- /dev/null +++ b/split__embeddings__utils_8cpp.html @@ -0,0 +1,154 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_utils/split_embeddings_utils.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                split_embeddings_utils.cpp File Reference
                                +
                                +
                                +
                                #include "fbgemm_gpu/split_embeddings_utils.cuh"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +#include <ATen/ATen.h>
                                +#include <torch/library.h>
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ TORCH_LIBRARY_FRAGMENT()

                                + +
                                +
                                + + + + + + + + + + + +
                                TORCH_LIBRARY_FRAGMENT (fbgemm ,
                                m  )
                                +
                                + +
                                +
                                + +

                                ◆ TORCH_LIBRARY_IMPL()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                TORCH_LIBRARY_IMPL (fbgemm ,
                                Meta ,
                                m  )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/split__embeddings__utils_8cuh.html b/split__embeddings__utils_8cuh.html new file mode 100644 index 000000000..d1226e52c --- /dev/null +++ b/split__embeddings__utils_8cuh.html @@ -0,0 +1,530 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/split_embeddings_utils.cuh File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                split_embeddings_utils.cuh File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <cuda.h>
                                +#include <cuda_runtime.h>
                                +#include "fbgemm_gpu/embedding_common.h"
                                +
                                + + + + + +

                                +Functions

                                std::tuple< at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensortranspose_embedding_input (at::Tensor hash_size_cumsum, int64_t total_hash_size_bits, at::Tensor indices, at::Tensor offsets, bool nobag=false, const c10::optional< at::Tensor > &vbe_b_t_map=c10::optional< at::Tensor >(), const int64_t info_B_num_bits=26, const int64_t info_B_mask=0x2FFFFFF, const int64_t total_unique_indices=-1, const bool is_index_select=false, const c10::optional< at::Tensor > &total_L_offsets=c10::optional< at::Tensor >(), const int64_t fixed_L_per_warp=0, const int64_t num_warps_per_feature=0)
                                 
                                std::tuple< at::Tensor, at::Tensorgenerate_vbe_metadata (const at::Tensor &B_offsets, const at::Tensor &B_offsets_rank_per_feature, const at::Tensor &output_offsets_feature_rank, const at::Tensor &D_offsets, const int64_t D, const bool nobag, const int64_t max_B_feature_rank, const int64_t info_B_num_bits, const int64_t total_B)
                                 
                                +

                                Macro Definition Documentation

                                + +

                                ◆ DECL_RADIX_SORT_PAIRS_FN

                                + +
                                +
                                + + + + + + + + + + + +
                                #define DECL_RADIX_SORT_PAIRS_FN( KeyT,
                                ValueT )
                                +
                                +Value:
                                cudaError_t radix_sort_pairs( \
                                +
                                void* d_temp_storage, \
                                +
                                size_t& temp_storage_bytes, \
                                +
                                const KeyT* d_keys_in, \
                                +
                                KeyT* d_keys_out, \
                                +
                                const ValueT* d_values_in, \
                                +
                                ValueT* d_values_out, \
                                +
                                int num_items, \
                                +
                                int begin_bit = 0, \
                                +
                                int end_bit = sizeof(KeyT) * 8, \
                                +
                                cudaStream_t stream = 0)
                                +
                                +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ adjust_info_B_num_bits()

                                + +
                                +
                                + + + + + + + + + + + +
                                std::tuple< int32_t, uint32_t > adjust_info_B_num_bits (int32_t B,
                                int32_t T )
                                +
                                + +
                                +
                                + +

                                ◆ DECL_RADIX_SORT_PAIRS_FN() [1/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                DECL_RADIX_SORT_PAIRS_FN (int64_t ,
                                double  )
                                +
                                + +
                                +
                                + +

                                ◆ DECL_RADIX_SORT_PAIRS_FN() [2/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                DECL_RADIX_SORT_PAIRS_FN (int64_t ,
                                float  )
                                +
                                + +
                                +
                                + +

                                ◆ DECL_RADIX_SORT_PAIRS_FN() [3/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                DECL_RADIX_SORT_PAIRS_FN (int64_t ,
                                int32_t  )
                                +
                                + +
                                +
                                + +

                                ◆ DECL_RADIX_SORT_PAIRS_FN() [4/4]

                                + +
                                +
                                + + + + + + + + + + + +
                                DECL_RADIX_SORT_PAIRS_FN (int64_t ,
                                int64_t  )
                                +
                                + +
                                +
                                + +

                                ◆ generate_vbe_metadata()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                std::tuple< at::Tensor, at::Tensor > generate_vbe_metadata (const Tensor & B_offsets,
                                const Tensor & B_offsets_rank_per_feature,
                                const Tensor & output_offsets_feature_rank,
                                const Tensor & D_offsets,
                                const int64_t D,
                                const bool nobag,
                                const int64_t max_B_feature_rank,
                                const int64_t info_B_num_bits,
                                const int64_t total_B )
                                +
                                +

                                Generate VBE metadata namely output_offsets and b_t_map

                                +

                                row_output_offsets A 1D tensor that contains the output offset of each b (sample) and t (feature/table) pair. The output serializes O_r_t where O_r_t is the local output of rank r and feature/table t (t is the fastest moving index). b_t_map A 1D tensor that contains the b and t information of the linearized b and t (b is the fastest moving index).

                                +
                                Parameters
                                + + + + + + + + + + +
                                B_offsetsBatch size offsets for all features.
                                B_offsets_rank_per_featureBatch size offsets for all ranks (GPUs) for each feature.
                                output_offsets_feature_rankOutput offsets for all features and ranks and features.
                                D_offsetsEmbedding dimension offsets. Required if nobag is false.
                                DThe embedding dimension. Required if nobag is true.
                                nobagA boolean to indicate if TBE is pooled (false) or sequence (true).
                                max_B_feature_rankMaximum number of batches for feature ranking
                                info_B_num_bitsThe number of bits used to encode a sample ID. (Used for populating b_t_map).
                                total_BThe total number of samples (i.e., the total number of b and t pairs).
                                +
                                +
                                + +
                                +
                                + +

                                ◆ get_infos_metadata()

                                + +
                                +
                                + + + + + + + + + + + + + + + + +
                                std::tuple< int64_t, int64_t > get_infos_metadata (at::Tensor unused,
                                int64_t B,
                                int64_t T )
                                +
                                + +
                                +
                                + +

                                ◆ transpose_embedding_input()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                std::tuple< at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor > transpose_embedding_input (at::Tensor hash_size_cumsum,
                                int64_t total_hash_size_bits,
                                at::Tensor indices,
                                at::Tensor offsets,
                                bool nobag = false,
                                const c10::optional< at::Tensor > & vbe_b_t_map = c10::optional< at::Tensor >(),
                                const int64_t info_B_num_bits = 26,
                                const int64_t info_B_mask = 0x2FFFFFF,
                                const int64_t total_unique_indices = -1,
                                const bool is_index_select = false,
                                const c10::optional< at::Tensor > & total_L_offsets = c10::optional< at::Tensor >(),
                                const int64_t fixed_L_per_warp = 0,
                                const int64_t num_warps_per_feature = 0 )
                                +
                                +

                                "Transpose" embedding inputs by sorting indices by their values. Logically this transpose compressed sparse row (CSR) representation stored in indices and offsets to compressed sparse column (CSC).

                                + +
                                +
                                +

                                Variable Documentation

                                + +

                                ◆ DEFAULT_INFO_B_MASK

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr uint32_t DEFAULT_INFO_B_MASK = (1u << DEFAULT_INFO_B_NUM_BITS) - 1
                                +
                                +constexpr
                                +
                                + +
                                +
                                + +

                                ◆ DEFAULT_INFO_B_NUM_BITS

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr int DEFAULT_INFO_B_NUM_BITS = 26
                                +
                                +constexpr
                                +
                                + +
                                +
                                + +

                                ◆ DEFAULT_INFO_NUM_BITS

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr int DEFAULT_INFO_NUM_BITS = 32
                                +
                                +constexpr
                                +
                                + +
                                +
                                + +

                                ◆ MAX_B

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr uint32_t MAX_B = (1u << DEFAULT_INFO_B_NUM_BITS) - 1
                                +
                                +constexpr
                                +
                                + +
                                +
                                + +

                                ◆ MAX_T

                                + +
                                +
                                + + + + + +
                                + + + + +
                                constexpr uint32_t MAX_T
                                +
                                +constexpr
                                +
                                +Initial value:
                                =
                                + +
                                constexpr int DEFAULT_INFO_NUM_BITS
                                Definition split_embeddings_utils.cuh:17
                                +
                                constexpr int DEFAULT_INFO_B_NUM_BITS
                                Definition split_embeddings_utils.cuh:18
                                +
                                +
                                +
                                +
                                + + + + diff --git a/ssd__split__embeddings__cache__cuda_8cu.html b/ssd__split__embeddings__cache__cuda_8cu.html new file mode 100644 index 000000000..06a80e494 --- /dev/null +++ b/ssd__split__embeddings__cache__cuda_8cu.html @@ -0,0 +1,224 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_embeddings_cache_cuda.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                ssd_split_embeddings_cache_cuda.cu File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <ATen/core/TensorAccessor.h>
                                +#include <ATen/cuda/CUDAContext.h>
                                +#include <c10/cuda/CUDADeviceAssertion.h>
                                +#include <c10/cuda/CUDADeviceAssertionHost.h>
                                +#include <c10/cuda/CUDAGuard.h>
                                +#include <ATen/cuda/Atomic.cuh>
                                +#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
                                +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                                +

                                Typedef Documentation

                                + +

                                ◆ Tensor

                                + +
                                +
                                + + + + +
                                using Tensor = at::Tensor
                                +
                                + +
                                +
                                +

                                Function Documentation

                                + +

                                ◆ __launch_bounds__() [1/2]

                                + +
                                +
                                +
                                +template<typename scalar_t >
                                + + + + + + + +
                                __global__ __launch_bounds__ (kMaxThreads )
                                +
                                + +
                                +
                                + +

                                ◆ __launch_bounds__() [2/2]

                                + +
                                +
                                +
                                +template<>
                                + + + + + + + +
                                __global__ __launch_bounds__ (kMaxThreads )
                                +
                                + +
                                +
                                + +

                                ◆ masked_index_put_cuda()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + +
                                Tensor masked_index_put_cuda (Tensor self,
                                Tensor indices,
                                Tensor values,
                                Tensor count )
                                +
                                + +
                                +
                                + +

                                ◆ ssd_cache_populate_actions_cuda()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                std::tuple< Tensor, Tensor, Tensor, Tensor > ssd_cache_populate_actions_cuda (Tensor linear_indices,
                                int64_t total_hash_size,
                                Tensor lxu_cache_state,
                                int64_t time_stamp,
                                int64_t prefetch_dist,
                                Tensor lru_state )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/ssd__split__table__batched__embeddings_8cpp.html b/ssd__split__table__batched__embeddings_8cpp.html new file mode 100644 index 000000000..863728263 --- /dev/null +++ b/ssd__split__table__batched__embeddings_8cpp.html @@ -0,0 +1,197 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_split_table_batched_embeddings.cpp File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                ssd_split_table_batched_embeddings.cpp File Reference
                                +
                                +
                                +
                                #include <ATen/ATen.h>
                                +#include <ATen/core/op_registration/op_registration.h>
                                +#include <torch/library.h>
                                +#include <torch/custom_class.h>
                                +#include "./ssd_table_batched_embeddings.h"
                                +#include "fbgemm_gpu/sparse_ops_utils.h"
                                +

                                Function Documentation

                                + +

                                ◆ masked_index_put_byte_cuda()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + +
                                Tensor masked_index_put_byte_cuda (Tensor self,
                                Tensor indices,
                                Tensor values,
                                Tensor count )
                                +
                                + +
                                +
                                + +

                                ◆ masked_index_put_cuda()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + +
                                Tensor masked_index_put_cuda (Tensor self,
                                Tensor indices,
                                Tensor values,
                                Tensor count )
                                +
                                + +
                                +
                                + +

                                ◆ ssd_cache_populate_actions_cuda()

                                + +
                                +
                                + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                                std::tuple< Tensor, Tensor, Tensor, Tensor > ssd_cache_populate_actions_cuda (Tensor linear_indices,
                                int64_t total_hash_size,
                                Tensor lxu_cache_state,
                                int64_t time_stamp,
                                int64_t prefetch_dist,
                                Tensor lru_state )
                                +
                                + +
                                +
                                +
                                + + + + diff --git a/ssd__table__batched__embeddings_8h.html b/ssd__table__batched__embeddings_8h.html new file mode 100644 index 000000000..6d4abbf68 --- /dev/null +++ b/ssd__table__batched__embeddings_8h.html @@ -0,0 +1,122 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/ssd_split_embeddings_cache/ssd_table_batched_embeddings.h File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                ssd_table_batched_embeddings.h File Reference
                                +
                                +
                                +
                                #include <random>
                                +#include <ATen/ATen.h>
                                +#include <ATen/record_function.h>
                                +#include <folly/container/F14Map.h>
                                +#include <glog/logging.h>
                                +#include <folly/Random.h>
                                +#include <folly/concurrency/UnboundedQueue.h>
                                +#include <folly/executors/CPUThreadPoolExecutor.h>
                                +#include <folly/futures/Future.h>
                                +#include <folly/hash/Hash.h>
                                +#include <rocksdb/cache.h>
                                +#include <rocksdb/db.h>
                                +#include <rocksdb/filter_policy.h>
                                +#include <rocksdb/rate_limiter.h>
                                +#include <rocksdb/slice_transform.h>
                                +#include <rocksdb/table.h>
                                +#include <rocksdb/table_properties.h>
                                +#include <ATen/cuda/CUDAContext.h>
                                +#include <cuda_runtime.h>
                                +
                                + + + + + +

                                +Classes

                                class  Initializer
                                 
                                class  EmbeddingRocksDB
                                 
                                + + + +

                                +Namespaces

                                namespace  ssd
                                 
                                +
                                + + + + diff --git a/stacked__jagged__1d__to__dense_8cu.html b/stacked__jagged__1d__to__dense_8cu.html new file mode 100644 index 000000000..5d39b7445 --- /dev/null +++ b/stacked__jagged__1d__to__dense_8cu.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/stacked_jagged_1d_to_dense.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                stacked_jagged_1d_to_dense.cu File Reference
                                +
                                +
                                +
                                + + + + diff --git a/stacked__jagged__2d__to__dense_8cu.html b/stacked__jagged__2d__to__dense_8cu.html new file mode 100644 index 000000000..d43cf0df2 --- /dev/null +++ b/stacked__jagged__2d__to__dense_8cu.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/jagged_tensor_ops/stacked_jagged_2d_to_dense.cu File Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                stacked_jagged_2d_to_dense.cu File Reference
                                +
                                +
                                +
                                + + + + diff --git a/struct_stack_array-members.html b/struct_stack_array-members.html new file mode 100644 index 000000000..cf4268836 --- /dev/null +++ b/struct_stack_array-members.html @@ -0,0 +1,88 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                StackArray< T > Member List
                                +
                                +
                                + +

                                This is the complete list of members for StackArray< T >, including all inherited members.

                                + + + +
                                ndimStackArray< T >
                                valsStackArray< T >
                                + + + + diff --git a/struct_stack_array.html b/struct_stack_array.html new file mode 100644 index 000000000..467d514b7 --- /dev/null +++ b/struct_stack_array.html @@ -0,0 +1,123 @@ + + + + + + + +fbgemm_gpu: StackArray< T > Struct Template Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                + +
                                StackArray< T > Struct Template Reference
                                +
                                +
                                + +

                                #include <sparse_ops_utils.h>

                                +

                                Member Data Documentation

                                + +

                                ◆ ndim

                                + +
                                +
                                +
                                +template<typename T >
                                + + + + +
                                size_t ndim
                                +
                                + +
                                +
                                + +

                                ◆ vals

                                + +
                                +
                                +
                                +template<typename T >
                                + + + + +
                                T vals[kStackArrayMaxDims]
                                +
                                + +
                                +
                                +
                                The documentation for this struct was generated from the following file: +
                                + + + + diff --git a/struct_vec4_type.html b/struct_vec4_type.html new file mode 100644 index 000000000..a76f724ab --- /dev/null +++ b/struct_vec4_type.html @@ -0,0 +1,86 @@ + + + + + + + +fbgemm_gpu: Vec4Type< T > Struct Template Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                Vec4Type< T > Struct Template Reference
                                +
                                +
                                +
                                The documentation for this struct was generated from the following file: +
                                + + + + diff --git a/struct_vec4_type_3_01at_1_1_half_01_4-members.html b/struct_vec4_type_3_01at_1_1_half_01_4-members.html new file mode 100644 index 000000000..75b23dd70 --- /dev/null +++ b/struct_vec4_type_3_01at_1_1_half_01_4-members.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                Vec4Type< at::Half > Member List
                                +
                                +
                                + +

                                This is the complete list of members for Vec4Type< at::Half >, including all inherited members.

                                + + + + +
                                type typedefVec4Type< at::Half >
                                type typedefVec4Type< at::Half >
                                type typedefVec4Type< at::Half >
                                + + + + diff --git a/struct_vec4_type_3_01at_1_1_half_01_4.html b/struct_vec4_type_3_01at_1_1_half_01_4.html new file mode 100644 index 000000000..404a9b32d --- /dev/null +++ b/struct_vec4_type_3_01at_1_1_half_01_4.html @@ -0,0 +1,133 @@ + + + + + + + +fbgemm_gpu: Vec4Type< at::Half > Struct Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                + +
                                Vec4Type< at::Half > Struct Reference
                                +
                                +
                                +

                                Member Typedef Documentation

                                + +

                                ◆ type [1/3]

                                + +
                                +
                                + + + + +
                                using type = float2
                                +
                                + +
                                +
                                + +

                                ◆ type [2/3]

                                + +
                                +
                                + + + + +
                                using type = float2
                                +
                                + +
                                +
                                + +

                                ◆ type [3/3]

                                + +
                                +
                                + + + + +
                                using type = float2
                                +
                                + +
                                +
                                +
                                The documentation for this struct was generated from the following files: +
                                + + + + diff --git a/struct_vec4_type_3_01float_01_4-members.html b/struct_vec4_type_3_01float_01_4-members.html new file mode 100644 index 000000000..545c39a24 --- /dev/null +++ b/struct_vec4_type_3_01float_01_4-members.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                Vec4Type< float > Member List
                                +
                                +
                                + +

                                This is the complete list of members for Vec4Type< float >, including all inherited members.

                                + + + + +
                                type typedefVec4Type< float >
                                type typedefVec4Type< float >
                                type typedefVec4Type< float >
                                + + + + diff --git a/struct_vec4_type_3_01float_01_4.html b/struct_vec4_type_3_01float_01_4.html new file mode 100644 index 000000000..cbeb62bb8 --- /dev/null +++ b/struct_vec4_type_3_01float_01_4.html @@ -0,0 +1,133 @@ + + + + + + + +fbgemm_gpu: Vec4Type< float > Struct Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                + +
                                Vec4Type< float > Struct Reference
                                +
                                +
                                +

                                Member Typedef Documentation

                                + +

                                ◆ type [1/3]

                                + +
                                +
                                + + + + +
                                using type = float4
                                +
                                + +
                                +
                                + +

                                ◆ type [2/3]

                                + +
                                +
                                + + + + +
                                using type = float4
                                +
                                + +
                                +
                                + +

                                ◆ type [3/3]

                                + +
                                +
                                + + + + +
                                using type = float4
                                +
                                + +
                                +
                                +
                                The documentation for this struct was generated from the following files: +
                                + + + + diff --git a/struct_vec4_type_3_01uint8__t_01_4-members.html b/struct_vec4_type_3_01uint8__t_01_4-members.html new file mode 100644 index 000000000..0714d3464 --- /dev/null +++ b/struct_vec4_type_3_01uint8__t_01_4-members.html @@ -0,0 +1,89 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                +
                                Vec4Type< uint8_t > Member List
                                +
                                +
                                + +

                                This is the complete list of members for Vec4Type< uint8_t >, including all inherited members.

                                + + + + +
                                type typedefVec4Type< uint8_t >
                                type typedefVec4Type< uint8_t >
                                type typedefVec4Type< uint8_t >
                                + + + + diff --git a/struct_vec4_type_3_01uint8__t_01_4.html b/struct_vec4_type_3_01uint8__t_01_4.html new file mode 100644 index 000000000..3bcfea491 --- /dev/null +++ b/struct_vec4_type_3_01uint8__t_01_4.html @@ -0,0 +1,133 @@ + + + + + + + +fbgemm_gpu: Vec4Type< uint8_t > Struct Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + +
                                +
                                + +
                                Vec4Type< uint8_t > Struct Reference
                                +
                                +
                                +

                                Member Typedef Documentation

                                + +

                                ◆ type [1/3]

                                + +
                                +
                                + + + + +
                                using type = uint8_t
                                +
                                + +
                                +
                                + +

                                ◆ type [2/3]

                                + +
                                +
                                + + + + +
                                using type = uint8_t
                                +
                                + +
                                +
                                + +

                                ◆ type [3/3]

                                + +
                                +
                                + + + + +
                                using type = uint8_t
                                +
                                + +
                                +
                                +
                                The documentation for this struct was generated from the following files: +
                                + + + + diff --git a/structfbgemm__gpu_1_1_bitonic_sort-members.html b/structfbgemm__gpu_1_1_bitonic_sort-members.html new file mode 100644 index 000000000..609cdf028 --- /dev/null +++ b/structfbgemm__gpu_1_1_bitonic_sort-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                +
                                BitonicSort< K, V, Dir, Comp > Member List
                                +
                                +
                                + +

                                This is the complete list of members for BitonicSort< K, V, Dir, Comp >, including all inherited members.

                                + + +
                                sort(K k[1], V v[1])BitonicSort< K, V, Dir, Comp >inlinestatic
                                + + + + diff --git a/structfbgemm__gpu_1_1_bitonic_sort.html b/structfbgemm__gpu_1_1_bitonic_sort.html new file mode 100644 index 000000000..20e3c9d8e --- /dev/null +++ b/structfbgemm__gpu_1_1_bitonic_sort.html @@ -0,0 +1,124 @@ + + + + + + + +fbgemm_gpu: BitonicSort< K, V, Dir, Comp > Struct Template Reference + + + + + + + + + + + +
                                +
                                + + + + + + +
                                +
                                fbgemm_gpu +
                                +
                                +
                                + + + + + + + + +
                                +
                                + + +
                                +
                                +
                                +
                                +
                                +
                                Loading...
                                +
                                Searching...
                                +
                                No Matches
                                +
                                +
                                +
                                +
                                + + +
                                +
                                + +
                                BitonicSort< K, V, Dir, Comp > Struct Template Reference
                                +
                                +
                                +

                                Member Function Documentation

                                + +

                                ◆ sort()

                                + +
                                +
                                +
                                +template<typename K , typename V , bool Dir, typename Comp >
                                + + + + + +
                                + + + + + + + + + + + +
                                static __device__ void sort (K k[1],
                                V v[1] )
                                +
                                +inlinestatic
                                +
                                + +
                                +
                                +
                                The documentation for this struct was generated from the following file: +
                                + + + + diff --git a/structfbgemm__gpu_1_1_comparator-members.html b/structfbgemm__gpu_1_1_comparator-members.html index d10517b43..39c3b5506 100644 --- a/structfbgemm__gpu_1_1_comparator-members.html +++ b/structfbgemm__gpu_1_1_comparator-members.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Member List + + @@ -29,7 +31,7 @@

                      - + @@ -69,7 +71,7 @@
                    @@ -78,10 +80,13 @@

                    This is the complete list of members for Comparator< T >, including all inherited members.

                    -
                    + + + +
                    gt(T a, T b)Comparator< T >inlinestatic
                    lt(T a, T b)Comparator< T >inlinestatic
                    diff --git a/structfbgemm__gpu_1_1_comparator.html b/structfbgemm__gpu_1_1_comparator.html index 458388948..1ac9ea2f1 100644 --- a/structfbgemm__gpu_1_1_comparator.html +++ b/structfbgemm__gpu_1_1_comparator.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Comparator< T > Struct Template Reference + + @@ -29,7 +31,7 @@ - + @@ -69,7 +71,7 @@

                  @@ -78,13 +80,76 @@
                  Comparator< T > Struct Template Reference
                  +

                  Member Function Documentation

                  + +

                  ◆ gt()

                  + +
                  +
                  +
                  +template<typename T >
                  + + + + + +
                  + + + + + + + + + + + +
                  static __device__ bool gt (T a,
                  T b )
                  +
                  +inlinestatic
                  +
                  + +
                  +
                  + +

                  ◆ lt()

                  + +
                  +
                  +
                  +template<typename T >
                  + + + + + +
                  + + + + + + + + + + + +
                  static __device__ bool lt (T a,
                  T b )
                  +
                  +inlinestatic
                  +
                  + +
                  +

                  The documentation for this struct was generated from the following file:
                    -
                  • /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
                  • +
                  • /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/fbgemm_cuda_utils.cuh
                  diff --git a/structfbgemm__gpu_1_1_default_ptr_traits-members.html b/structfbgemm__gpu_1_1_default_ptr_traits-members.html new file mode 100644 index 000000000..bd2555b2c --- /dev/null +++ b/structfbgemm__gpu_1_1_default_ptr_traits-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  DefaultPtrTraits< T > Member List
                  +
                  +
                  + +

                  This is the complete list of members for DefaultPtrTraits< T >, including all inherited members.

                  + + +
                  PtrType typedefDefaultPtrTraits< T >
                  + + + + diff --git a/structfbgemm__gpu_1_1_default_ptr_traits.html b/structfbgemm__gpu_1_1_default_ptr_traits.html new file mode 100644 index 000000000..a66c577c0 --- /dev/null +++ b/structfbgemm__gpu_1_1_default_ptr_traits.html @@ -0,0 +1,111 @@ + + + + + + + +fbgemm_gpu: DefaultPtrTraits< T > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  DefaultPtrTraits< T > Struct Template Reference
                  +
                  +
                  + +

                  #include <fbgemm_tensor_accessor.h>

                  +

                  Member Typedef Documentation

                  + +

                  ◆ PtrType

                  + +
                  +
                  +
                  +template<typename T >
                  + + + + +
                  typedef T* PtrType
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_half4-members.html b/structfbgemm__gpu_1_1_half4-members.html new file mode 100644 index 000000000..9b0b1bc9f --- /dev/null +++ b/structfbgemm__gpu_1_1_half4-members.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Half4 Member List
                  +
                  +
                  + +

                  This is the complete list of members for Half4, including all inherited members.

                  + + + + +
                  aHalf4
                  bHalf4
                  store(at::Half *p)Half4inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_half4.html b/structfbgemm__gpu_1_1_half4.html new file mode 100644 index 000000000..e0efc2346 --- /dev/null +++ b/structfbgemm__gpu_1_1_half4.html @@ -0,0 +1,147 @@ + + + + + + + +fbgemm_gpu: Half4 Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Half4 Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ store()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ void store (at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ a

                  + +
                  +
                  + + + + +
                  half2 a
                  +
                  + +
                  +
                  + +

                  ◆ b

                  + +
                  +
                  + + + + +
                  half2 b
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory.html b/structfbgemm__gpu_1_1_shared_memory.html new file mode 100644 index 000000000..a1fc1857b --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: SharedMemory< T > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< T > Struct Template Reference
                  +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4-members.html new file mode 100644 index 000000000..1c4d6e133 --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< Vec4T< at::acc_type< double, true > > > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< Vec4T< at::acc_type< double, true > > >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< Vec4T< at::acc_type< double, true > > >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html new file mode 100644 index 000000000..3075bb93f --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01double_00_01true_01_4_01_4_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< Vec4T< at::acc_type< double, true > > > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< Vec4T< at::acc_type< double, true > > > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ Vec4T< at::acc_type< double, true > > * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4-members.html new file mode 100644 index 000000000..38cf51354 --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< Vec4T< at::acc_type< float, true > > > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< Vec4T< at::acc_type< float, true > > >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< Vec4T< at::acc_type< float, true > > >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html new file mode 100644 index 000000000..790d09845 --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01_vec4_t_3_01at_1_1acc__type_3_01float_00_01true_01_4_01_4_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< Vec4T< at::acc_type< float, true > > > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< Vec4T< at::acc_type< float, true > > > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ Vec4T< at::acc_type< float, true > > * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4-members.html new file mode 100644 index 000000000..abb155f7f --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< double > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< double >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< double >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html new file mode 100644 index 000000000..e6dc28e81 --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01double_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< double > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< double > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ double * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4-members.html new file mode 100644 index 000000000..63c7d528d --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< float > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< float >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< float >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html new file mode 100644 index 000000000..ecfd0a3ef --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01float_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< float > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< float > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ float * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4-members.html new file mode 100644 index 000000000..21f7840de --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< int32_t > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< int32_t >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< int32_t >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html new file mode 100644 index 000000000..badcb331c --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01int32__t_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< int32_t > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< int32_t > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ int32_t * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4-members.html b/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4-members.html new file mode 100644 index 000000000..b08069c3c --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  SharedMemory< int64_t > Member List
                  +
                  +
                  + +

                  This is the complete list of members for SharedMemory< int64_t >, including all inherited members.

                  + + +
                  getPointer()SharedMemory< int64_t >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html b/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html new file mode 100644 index 000000000..efc2c1ac6 --- /dev/null +++ b/structfbgemm__gpu_1_1_shared_memory_3_01int64__t_01_4.html @@ -0,0 +1,118 @@ + + + + + + + +fbgemm_gpu: SharedMemory< int64_t > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  SharedMemory< int64_t > Struct Reference
                  +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ getPointer()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  __device__ int64_t * getPointer ()
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state-members.html b/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state-members.html new file mode 100644 index 000000000..fb1c44998 --- /dev/null +++ b/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state-members.html @@ -0,0 +1,91 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  StochasticRoundingRNGState Member List
                  +
                  +
                  + +

                  This is the complete list of members for StochasticRoundingRNGState, including all inherited members.

                  + + +
                  aStochasticRoundingRNGState
                  + + + + diff --git a/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html b/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html new file mode 100644 index 000000000..4e379e355 --- /dev/null +++ b/structfbgemm__gpu_1_1_stochastic_rounding_r_n_g_state.html @@ -0,0 +1,107 @@ + + + + + + + +fbgemm_gpu: StochasticRoundingRNGState Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  StochasticRoundingRNGState Struct Reference
                  +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ a

                  + +
                  +
                  + + + + +
                  uint64_t a
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_acc_t-members.html b/structfbgemm__gpu_1_1_vec4_acc_t-members.html new file mode 100644 index 000000000..485f16a22 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_acc_t-members.html @@ -0,0 +1,109 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4AccT Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4AccT, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + +
                  accVec4AccT
                  add(const float4 *ptr)Vec4AccTinline
                  add(const float2 *ptr)Vec4AccTinline
                  add(const uint8_t *ptr)Vec4AccTinline
                  add_(const float *vals)Vec4AccTinline
                  add_(const half2 *vals_h)Vec4AccTinline
                  div(uint32_t denom)Vec4AccTinline
                  fma(const float4 *ptr, const float weight)Vec4AccTinline
                  fma(const float2 *ptr, const float weight)Vec4AccTinline
                  fma(const uint8_t *ptr, const float weight)Vec4AccTinline
                  fma_(const float *vals, const float weight)Vec4AccTinline
                  fma_(const half *vals, const float weight)Vec4AccTinline
                  reset()Vec4AccTinline
                  store(float4 *ptr)Vec4AccTinline
                  store(float2 *ptr)Vec4AccTinline
                  store(uint8_t *ptr)Vec4AccTinline
                  store_(const float4 *src, float4 *dst)Vec4AccTinline
                  store_(const float4 *src, float2 *dst)Vec4AccTinline
                  Vec4AccT()Vec4AccTinline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_acc_t.html b/structfbgemm__gpu_1_1_vec4_acc_t.html new file mode 100644 index 000000000..fd3e5f3bf --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_acc_t.html @@ -0,0 +1,599 @@ + + + + + + + +fbgemm_gpu: Vec4AccT Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4AccT Struct Reference
                  +
                  +
                  +
                  +Inheritance diagram for Vec4AccT:
                  +
                  +
                  + + +Vec4StepT< STEP, input_t > +Vec4StepT< STEP, at::Half > +Vec4StepT< STEP, float > +Vec4StepT< STEP, uint8_t > + +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4AccT()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4AccT ()
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add() [1/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add (const float2 * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add() [2/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add (const float4 * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add() [3/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add (const uint8_t * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const float * vals)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const half2 * vals_h)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ div()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void div (uint32_t denom)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma() [1/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma (const float2 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma() [2/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma (const float4 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma() [3/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma (const uint8_t * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const float * vals,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const half * vals,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ reset()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void reset ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float2 * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float4 * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/3]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * ptr)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store_ (const float4 * src,
                  float2 * dst )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store_ (const float4 * src,
                  float4 * dst )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float acc[4]
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_acc_t.png b/structfbgemm__gpu_1_1_vec4_acc_t.png new file mode 100644 index 000000000..54821f6cb Binary files /dev/null and b/structfbgemm__gpu_1_1_vec4_acc_t.png differ diff --git a/structfbgemm__gpu_1_1_vec4_step_t-members.html b/structfbgemm__gpu_1_1_vec4_step_t-members.html new file mode 100644 index 000000000..c4459fee4 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t-members.html @@ -0,0 +1,109 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4StepT< STEP, input_t > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4StepT< STEP, input_t >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + +
                  accVec4AccT
                  add(const float4 *ptr)Vec4AccTinline
                  add(const float2 *ptr)Vec4AccTinline
                  add(const uint8_t *ptr)Vec4AccTinline
                  add_(const float *vals)Vec4AccTinline
                  add_(const half2 *vals_h)Vec4AccTinline
                  div(uint32_t denom)Vec4AccTinline
                  fma(const float4 *ptr, const float weight)Vec4AccTinline
                  fma(const float2 *ptr, const float weight)Vec4AccTinline
                  fma(const uint8_t *ptr, const float weight)Vec4AccTinline
                  fma_(const float *vals, const float weight)Vec4AccTinline
                  fma_(const half *vals, const float weight)Vec4AccTinline
                  reset()Vec4AccTinline
                  store(float4 *ptr)Vec4AccTinline
                  store(float2 *ptr)Vec4AccTinline
                  store(uint8_t *ptr)Vec4AccTinline
                  store_(const float4 *src, float4 *dst)Vec4AccTinline
                  store_(const float4 *src, float2 *dst)Vec4AccTinline
                  Vec4AccT()Vec4AccTinline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t.html b/structfbgemm__gpu_1_1_vec4_step_t.html new file mode 100644 index 000000000..7d7a43d35 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t.html @@ -0,0 +1,101 @@ + + + + + + + +fbgemm_gpu: Vec4StepT< STEP, input_t > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4StepT< STEP, input_t > Struct Template Reference
                  +
                  +
                  +
                  +Inheritance diagram for Vec4StepT< STEP, input_t >:
                  +
                  +
                  + + +Vec4AccT + +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t.png b/structfbgemm__gpu_1_1_vec4_step_t.png new file mode 100644 index 000000000..86c6a7700 Binary files /dev/null and b/structfbgemm__gpu_1_1_vec4_step_t.png differ diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4-members.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4-members.html new file mode 100644 index 000000000..5579bd210 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4-members.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4StepT< STEP, at::Half > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4StepT< STEP, at::Half >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4AccT
                  add(const float4 *ptr)Vec4AccTinline
                  add(const float2 *ptr)Vec4AccTinline
                  add(const uint8_t *ptr)Vec4AccTinline
                  add_(const float *vals)Vec4AccTinline
                  add_(const half2 *vals_h)Vec4AccTinline
                  div(uint32_t denom)Vec4AccTinline
                  fma(const float4 *ptr, const float weight)Vec4AccTinline
                  fma(const float2 *ptr, const float weight)Vec4AccTinline
                  fma(const uint8_t *ptr, const float weight)Vec4AccTinline
                  fma_(const float *vals, const float weight)Vec4AccTinline
                  fma_(const half *vals, const float weight)Vec4AccTinline
                  index_add(uint32_t idx)Vec4StepT< STEP, at::Half >inline
                  index_fma(uint32_t idx, const float weight)Vec4StepT< STEP, at::Half >inline
                  index_store(uint32_t idx, float4 *ptr)Vec4StepT< STEP, at::Half >inline
                  index_store(uint32_t idx, float2 *ptr)Vec4StepT< STEP, at::Half >inline
                  index_store(uint32_t idx, uint8_t *ptr)Vec4StepT< STEP, at::Half >inline
                  index_weighted_store(uint32_t idx, float4 *ptr, const float weight)Vec4StepT< STEP, at::Half >inline
                  index_weighted_store(uint32_t idx, float2 *ptr, const float weight)Vec4StepT< STEP, at::Half >inline
                  index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)Vec4StepT< STEP, at::Half >inline
                  load(const float2 *ptr, const uint32_t idx)Vec4StepT< STEP, at::Half >inline
                  loaded_valsVec4StepT< STEP, at::Half >
                  reset()Vec4AccTinline
                  store(float4 *ptr)Vec4AccTinline
                  store(float2 *ptr)Vec4AccTinline
                  store(uint8_t *ptr)Vec4AccTinline
                  store_(const float4 *src, float4 *dst)Vec4AccTinline
                  store_(const float4 *src, float2 *dst)Vec4AccTinline
                  sum()Vec4StepT< STEP, at::Half >inline
                  Vec4AccT()Vec4AccTinline
                  weighted_sum(const float *const weights, const uint32_t idx_shift, const uint32_t idx_scale)Vec4StepT< STEP, at::Half >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html new file mode 100644 index 000000000..9b2bb3ad6 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.html @@ -0,0 +1,472 @@ + + + + + + + +fbgemm_gpu: Vec4StepT< STEP, at::Half > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4StepT< STEP, at::Half > Struct Template Reference
                  +
                  +
                  +
                  +Inheritance diagram for Vec4StepT< STEP, at::Half >:
                  +
                  +
                  + + +Vec4AccT + +
                  +

                  Member Function Documentation

                  + +

                  ◆ index_add()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void index_add (uint32_t idx)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_fma()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_fma (uint32_t idx,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float2 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float4 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  uint8_t * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float2 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float4 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  uint8_t * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void load (const float2 * ptr,
                  const uint32_t idx )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void sum ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ weighted_sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void weighted_sum (const float *const weights,
                  const uint32_t idx_shift,
                  const uint32_t idx_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ loaded_vals

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + +
                  float2 loaded_vals[STEP]
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.png b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.png new file mode 100644 index 000000000..cf5cbe74e Binary files /dev/null and b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01at_1_1_half_01_4.png differ diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4-members.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4-members.html new file mode 100644 index 000000000..10caa3632 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4-members.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4StepT< STEP, float > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4StepT< STEP, float >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4AccT
                  add(const float4 *ptr)Vec4AccTinline
                  add(const float2 *ptr)Vec4AccTinline
                  add(const uint8_t *ptr)Vec4AccTinline
                  add_(const float *vals)Vec4AccTinline
                  add_(const half2 *vals_h)Vec4AccTinline
                  div(uint32_t denom)Vec4AccTinline
                  fma(const float4 *ptr, const float weight)Vec4AccTinline
                  fma(const float2 *ptr, const float weight)Vec4AccTinline
                  fma(const uint8_t *ptr, const float weight)Vec4AccTinline
                  fma_(const float *vals, const float weight)Vec4AccTinline
                  fma_(const half *vals, const float weight)Vec4AccTinline
                  index_add(uint32_t idx)Vec4StepT< STEP, float >inline
                  index_fma(uint32_t idx, const float weight)Vec4StepT< STEP, float >inline
                  index_store(uint32_t idx, float4 *ptr)Vec4StepT< STEP, float >inline
                  index_store(uint32_t idx, float2 *ptr)Vec4StepT< STEP, float >inline
                  index_store(uint32_t idx, uint8_t *ptr)Vec4StepT< STEP, float >inline
                  index_weighted_store(uint32_t idx, float4 *ptr, const float weight)Vec4StepT< STEP, float >inline
                  index_weighted_store(uint32_t idx, float2 *ptr, const float weight)Vec4StepT< STEP, float >inline
                  index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)Vec4StepT< STEP, float >inline
                  load(const float4 *ptr, const uint32_t idx)Vec4StepT< STEP, float >inline
                  loaded_valsVec4StepT< STEP, float >
                  reset()Vec4AccTinline
                  store(float4 *ptr)Vec4AccTinline
                  store(float2 *ptr)Vec4AccTinline
                  store(uint8_t *ptr)Vec4AccTinline
                  store_(const float4 *src, float4 *dst)Vec4AccTinline
                  store_(const float4 *src, float2 *dst)Vec4AccTinline
                  sum()Vec4StepT< STEP, float >inline
                  Vec4AccT()Vec4AccTinline
                  weighted_sum(const float *const weights, const uint32_t idx_shift, const uint32_t idx_scale)Vec4StepT< STEP, float >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html new file mode 100644 index 000000000..086c951af --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.html @@ -0,0 +1,472 @@ + + + + + + + +fbgemm_gpu: Vec4StepT< STEP, float > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4StepT< STEP, float > Struct Template Reference
                  +
                  +
                  +
                  +Inheritance diagram for Vec4StepT< STEP, float >:
                  +
                  +
                  + + +Vec4AccT + +
                  +

                  Member Function Documentation

                  + +

                  ◆ index_add()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void index_add (uint32_t idx)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_fma()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_fma (uint32_t idx,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float2 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float4 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  uint8_t * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float2 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float4 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  uint8_t * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void load (const float4 * ptr,
                  const uint32_t idx )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void sum ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ weighted_sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void weighted_sum (const float *const weights,
                  const uint32_t idx_shift,
                  const uint32_t idx_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ loaded_vals

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + +
                  float4 loaded_vals[STEP]
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.png b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.png new file mode 100644 index 000000000..4c36c8706 Binary files /dev/null and b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01float_01_4.png differ diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4-members.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4-members.html new file mode 100644 index 000000000..ca1782fbc --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4-members.html @@ -0,0 +1,121 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4StepT< STEP, uint8_t > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4StepT< STEP, uint8_t >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4AccT
                  add(const float4 *ptr)Vec4AccTinline
                  add(const float2 *ptr)Vec4AccTinline
                  add(const uint8_t *ptr)Vec4AccTinline
                  add_(const float *vals)Vec4AccTinline
                  add_(const half2 *vals_h)Vec4AccTinline
                  div(uint32_t denom)Vec4AccTinline
                  fma(const float4 *ptr, const float weight)Vec4AccTinline
                  fma(const float2 *ptr, const float weight)Vec4AccTinline
                  fma(const uint8_t *ptr, const float weight)Vec4AccTinline
                  fma_(const float *vals, const float weight)Vec4AccTinline
                  fma_(const half *vals, const float weight)Vec4AccTinline
                  index_add(uint32_t idx)Vec4StepT< STEP, uint8_t >inline
                  index_fma(uint32_t idx, const float weight)Vec4StepT< STEP, uint8_t >inline
                  index_store(uint32_t idx, float4 *ptr)Vec4StepT< STEP, uint8_t >inline
                  index_store(uint32_t idx, float2 *ptr)Vec4StepT< STEP, uint8_t >inline
                  index_store(uint32_t idx, uint8_t *ptr)Vec4StepT< STEP, uint8_t >inline
                  index_weighted_store(uint32_t idx, float4 *ptr, const float weight)Vec4StepT< STEP, uint8_t >inline
                  index_weighted_store(uint32_t idx, float2 *ptr, const float weight)Vec4StepT< STEP, uint8_t >inline
                  index_weighted_store(uint32_t idx, uint8_t *ptr, const float weight)Vec4StepT< STEP, uint8_t >inline
                  load(const uint8_t *ptr, const uint32_t idx)Vec4StepT< STEP, uint8_t >inline
                  reset()Vec4AccTinline
                  store(float4 *ptr)Vec4AccTinline
                  store(float2 *ptr)Vec4AccTinline
                  store(uint8_t *ptr)Vec4AccTinline
                  store_(const float4 *src, float4 *dst)Vec4AccTinline
                  store_(const float4 *src, float2 *dst)Vec4AccTinline
                  sum()Vec4StepT< STEP, uint8_t >inline
                  Vec4AccT()Vec4AccTinline
                  Vec4StepT()Vec4StepT< STEP, uint8_t >inline
                  weighted_sum(const float *const weights, const uint32_t idx_shift, const uint32_t idx_scale)Vec4StepT< STEP, uint8_t >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html new file mode 100644 index 000000000..884ef1daa --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.html @@ -0,0 +1,483 @@ + + + + + + + +fbgemm_gpu: Vec4StepT< STEP, uint8_t > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4StepT< STEP, uint8_t > Struct Template Reference
                  +
                  +
                  +
                  +Inheritance diagram for Vec4StepT< STEP, uint8_t >:
                  +
                  +
                  + + +Vec4AccT + +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4StepT()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4StepT ()
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ index_add()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void index_add (uint32_t idx)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_fma()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_fma (uint32_t idx,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float2 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  float4 * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void index_store (uint32_t idx,
                  uint8_t * ptr )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [1/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float2 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [2/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  float4 * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ index_weighted_store() [3/3]

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void index_weighted_store (uint32_t idx,
                  uint8_t * ptr,
                  const float weight )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void load (const uint8_t * ptr,
                  const uint32_t idx )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void sum ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ weighted_sum()

                  + +
                  +
                  +
                  +template<uint32_t STEP>
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void weighted_sum (const float *const weights,
                  const uint32_t idx_shift,
                  const uint32_t idx_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.png b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.png new file mode 100644 index 000000000..fcf71b1f1 Binary files /dev/null and b/structfbgemm__gpu_1_1_vec4_step_t_3_01_s_t_e_p_00_01uint8__t_01_4.png differ diff --git a/structfbgemm__gpu_1_1_vec4_t.html b/structfbgemm__gpu_1_1_vec4_t.html new file mode 100644 index 000000000..c2afba3ca --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: Vec4T< T > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4T< T > Struct Template Reference
                  +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4-members.html b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4-members.html new file mode 100644 index 000000000..2249b179e --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4-members.html @@ -0,0 +1,114 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4T< at::BFloat16 > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4T< at::BFloat16 >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4T< at::BFloat16 >
                  add_(const Vec4T< float > &a)Vec4T< at::BFloat16 >inline
                  add_(const Vec4T< at::Half > &a)Vec4T< at::BFloat16 >inline
                  copy(const at::BFloat16 *src, at::BFloat16 *dst)Vec4T< at::BFloat16 >inlinestatic
                  element_wise_mul_(const Vec4T< float > &a)Vec4T< at::BFloat16 >inline
                  element_wise_mul_(const Vec4T< at::Half > &a)Vec4T< at::BFloat16 >inline
                  fma_(const Vec4T< at::Half > &a, const float b)Vec4T< at::BFloat16 >inline
                  fma_(const Vec4T< float > &a, const float b)Vec4T< at::BFloat16 >inline
                  load(const at::BFloat16 *p)Vec4T< at::BFloat16 >inline
                  load(const at::Half *p)Vec4T< at::BFloat16 >inline
                  load(const float *p)Vec4T< at::BFloat16 >inline
                  load(const double *p)Vec4T< at::BFloat16 >inline
                  load(const uint8_t *p)Vec4T< at::BFloat16 >inline
                  mul_(float scale)Vec4T< at::BFloat16 >inline
                  store(at::Half *p) constVec4T< at::BFloat16 >inline
                  store(at::BFloat16 *p) constVec4T< at::BFloat16 >inline
                  store(float *p) constVec4T< at::BFloat16 >inline
                  store(double *p) constVec4T< at::BFloat16 >inline
                  store(uint8_t *p) constVec4T< at::BFloat16 >inline
                  Vec4T()Vec4T< at::BFloat16 >inline
                  Vec4T(const at::BFloat16 *p)Vec4T< at::BFloat16 >inline
                  Vec4T(const at::Half *p)Vec4T< at::BFloat16 >inline
                  Vec4T(const float *p)Vec4T< at::BFloat16 >inline
                  Vec4T(const double *p)Vec4T< at::BFloat16 >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html new file mode 100644 index 000000000..80fc29153 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_b_float16_01_4.html @@ -0,0 +1,696 @@ + + + + + + + +fbgemm_gpu: Vec4T< at::BFloat16 > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4T< at::BFloat16 > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4T() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< at::Half > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ copy()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  static DEVICE_INLINE void copy (const at::BFloat16 * src,
                  at::BFloat16 * dst )
                  +
                  +inlinestatic
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< at::Half > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< at::Half > & a,
                  const float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< float > & a,
                  const float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const uint8_t * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul_ (float scale)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::Half * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (double * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4-members.html b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4-members.html new file mode 100644 index 000000000..ba166c2a6 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4-members.html @@ -0,0 +1,114 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4T< at::Half > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4T< at::Half >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4T< at::Half >
                  add_(const Vec4T< float > &a)Vec4T< at::Half >inline
                  add_(const Vec4T< at::Half > &a)Vec4T< at::Half >inline
                  copy(const at::Half *src, at::Half *dst)Vec4T< at::Half >inlinestatic
                  element_wise_mul_(const Vec4T< float > &a)Vec4T< at::Half >inline
                  element_wise_mul_(const Vec4T< at::Half > &a)Vec4T< at::Half >inline
                  fma_(const Vec4T< at::Half > &a, const float b)Vec4T< at::Half >inline
                  fma_(const Vec4T< float > &a, const float b)Vec4T< at::Half >inline
                  load(const at::Half *p)Vec4T< at::Half >inline
                  load(const at::BFloat16 *p)Vec4T< at::Half >inline
                  load(const float *p)Vec4T< at::Half >inline
                  load(const double *p)Vec4T< at::Half >inline
                  load(const uint8_t *p)Vec4T< at::Half >inline
                  mul_(float scale)Vec4T< at::Half >inline
                  store(at::Half *p) constVec4T< at::Half >inline
                  store(at::BFloat16 *p) constVec4T< at::Half >inline
                  store(float *p) constVec4T< at::Half >inline
                  store(double *p) constVec4T< at::Half >inline
                  store(uint8_t *p) constVec4T< at::Half >inline
                  Vec4T()Vec4T< at::Half >inline
                  Vec4T(const at::Half *p)Vec4T< at::Half >inline
                  Vec4T(const at::BFloat16 *p)Vec4T< at::Half >inline
                  Vec4T(const float *p)Vec4T< at::Half >inline
                  Vec4T(const double *p)Vec4T< at::Half >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html new file mode 100644 index 000000000..43c08b7f2 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01at_1_1_half_01_4.html @@ -0,0 +1,696 @@ + + + + + + + +fbgemm_gpu: Vec4T< at::Half > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4T< at::Half > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4T() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< at::Half > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ add_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ copy()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  static DEVICE_INLINE void copy (const at::Half * src,
                  at::Half * dst )
                  +
                  +inlinestatic
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< at::Half > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< at::Half > & a,
                  const float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< float > & a,
                  const float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const uint8_t * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul_ (float scale)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::Half * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (double * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4-members.html b/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4-members.html new file mode 100644 index 000000000..a0fee38dd --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4-members.html @@ -0,0 +1,110 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4T< double > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4T< double >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + +
                  accVec4T< double >
                  add_(const Vec4T< double > &a)Vec4T< double >inline
                  copy(const double *src, double *dst)Vec4T< double >inlinestatic
                  element_wise_mul_(const Vec4T< double > &a)Vec4T< double >inline
                  fma_(const Vec4T< double > &a, const double b)Vec4T< double >inline
                  load(const at::Half *p)Vec4T< double >inline
                  load(const at::BFloat16 *p)Vec4T< double >inline
                  load(const float *p)Vec4T< double >inline
                  load(const uint8_t *p)Vec4T< double >inline
                  load(const double *p)Vec4T< double >inline
                  mul_(float scale)Vec4T< double >inline
                  store(double *p) constVec4T< double >inline
                  store(float *p) constVec4T< double >inline
                  store(at::Half *p) constVec4T< double >inline
                  store(at::BFloat16 *p) constVec4T< double >inline
                  Vec4T()Vec4T< double >inline
                  Vec4T(const at::Half *p)Vec4T< double >inline
                  Vec4T(const at::BFloat16 *p)Vec4T< double >inline
                  Vec4T(const float *p)Vec4T< double >inline
                  Vec4T(const double *p)Vec4T< double >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html b/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html new file mode 100644 index 000000000..b46910f40 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01double_01_4.html @@ -0,0 +1,592 @@ + + + + + + + +fbgemm_gpu: Vec4T< double > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4T< double > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4T() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< double > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ copy()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  static DEVICE_INLINE void copy (const double * src,
                  double * dst )
                  +
                  +inlinestatic
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< double > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< double > & a,
                  const double b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const uint8_t * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul_ (float scale)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/4]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/4]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::Half * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/4]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (double * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/4]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  double4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4-members.html b/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4-members.html new file mode 100644 index 000000000..ea78fb9be --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4-members.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  Vec4T< float > Member List
                  +
                  +
                  + +

                  This is the complete list of members for Vec4T< float >, including all inherited members.

                  + + + + + + + + + + + + + + + + + + + + + + + +
                  accVec4T< float >
                  add_(const Vec4T< float > &a)Vec4T< float >inline
                  copy(const float *src, float *dst)Vec4T< float >inlinestatic
                  element_wise_mul_(const Vec4T< float > &a)Vec4T< float >inline
                  fma_(const Vec4T< float > &a, const float b)Vec4T< float >inline
                  load(const float *p)Vec4T< float >inline
                  load(const double *p)Vec4T< float >inline
                  load(const at::Half *p)Vec4T< float >inline
                  load(const at::BFloat16 *p)Vec4T< float >inline
                  load(const uint8_t *p)Vec4T< float >inline
                  mul_(float scale)Vec4T< float >inline
                  store(float *p) constVec4T< float >inline
                  store(float4 *p) constVec4T< float >inline
                  store(at::Half *p) constVec4T< float >inline
                  store(at::BFloat16 *p) constVec4T< float >inline
                  store(double *p) constVec4T< float >inline
                  store(uint8_t *p) constVec4T< float >inline
                  Vec4T()Vec4T< float >inline
                  Vec4T(const float *p)Vec4T< float >inline
                  Vec4T(const double *p)Vec4T< float >inline
                  Vec4T(const at::Half *p)Vec4T< float >inline
                  Vec4T(const at::BFloat16 *p)Vec4T< float >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html b/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html new file mode 100644 index 000000000..d00d48a11 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec4_t_3_01float_01_4.html @@ -0,0 +1,642 @@ + + + + + + + +fbgemm_gpu: Vec4T< float > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  Vec4T< float > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ Vec4T() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ Vec4T() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE Vec4T (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ copy()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  static DEVICE_INLINE void copy (const float * src,
                  float * dst )
                  +
                  +inlinestatic
                  +
                  + +
                  +
                  + +

                  ◆ element_wise_mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void element_wise_mul_ (const Vec4T< float > & a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma_ (const Vec4T< float > & a,
                  const float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [1/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::BFloat16 * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [2/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const at::Half * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [3/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const double * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [4/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const float * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load() [5/5]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void load (const uint8_t * p)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul_()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul_ (float scale)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (at::Half * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (double * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (float4 * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/6]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * p) const
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t.html b/structfbgemm__gpu_1_1_vec_n_t.html new file mode 100644 index 000000000..bd7d1329f --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t.html @@ -0,0 +1,90 @@ + + + + + + + +fbgemm_gpu: VecNT< N, PrimitiveType > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< N, PrimitiveType > Struct Template Reference
                  +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4-members.html new file mode 100644 index 000000000..ed0747d60 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 16, PrimitiveType::INT > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 16, PrimitiveType::INT >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 16, PrimitiveType::INT >
                  add(uint32_t v, half2 shift_scale)VecNT< 16, PrimitiveType::INT >inline
                  fma(uint32_t v, half2 shift_scale, float b)VecNT< 16, PrimitiveType::INT >inline
                  mul(float a)VecNT< 16, PrimitiveType::INT >inline
                  store(float *output_ptr, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=16)VecNT< 16, PrimitiveType::INT >inline
                  VecNT()VecNT< 16, PrimitiveType::INT >inline
                  VecNT(uint32_t v, half2 shift_scale)VecNT< 16, PrimitiveType::INT >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html new file mode 100644 index 000000000..5c87a2896 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_0116_00_01_primitive_type_1_1_i_n_t_01_4.html @@ -0,0 +1,503 @@ + + + + + + + +fbgemm_gpu: VecNT< 16, PrimitiveType::INT > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 16, PrimitiveType::INT > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE VecNT (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void add (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void fma (uint32_t v,
                  half2 shift_scale,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 16 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float_16 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4-members.html new file mode 100644 index 000000000..be1046977 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 1, PrimitiveType::FP > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 1, PrimitiveType::FP >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 1, PrimitiveType::FP >
                  add(float a)VecNT< 1, PrimitiveType::FP >inline
                  fma(float a, float b)VecNT< 1, PrimitiveType::FP >inline
                  mul(float a)VecNT< 1, PrimitiveType::FP >inline
                  store(float *output_ptr, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=1)VecNT< 1, PrimitiveType::FP >inline
                  VecNT()VecNT< 1, PrimitiveType::FP >inline
                  VecNT(float a)VecNT< 1, PrimitiveType::FP >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html new file mode 100644 index 000000000..ff5f1cfaf --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_011_00_01_primitive_type_1_1_f_p_01_4.html @@ -0,0 +1,490 @@ + + + + + + + +fbgemm_gpu: VecNT< 1, PrimitiveType::FP > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 1, PrimitiveType::FP > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma (float a,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 1 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4-members.html new file mode 100644 index 000000000..40170c94f --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 2, PrimitiveType::FP > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 2, PrimitiveType::FP >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 2, PrimitiveType::FP >
                  add(half2 a)VecNT< 2, PrimitiveType::FP >inline
                  fma(half2 a, float b)VecNT< 2, PrimitiveType::FP >inline
                  mul(float a)VecNT< 2, PrimitiveType::FP >inline
                  store(float *output_ptr, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=2)VecNT< 2, PrimitiveType::FP >inline
                  VecNT()VecNT< 2, PrimitiveType::FP >inline
                  VecNT(half2 a)VecNT< 2, PrimitiveType::FP >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html new file mode 100644 index 000000000..47fc5a0d2 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_012_00_01_primitive_type_1_1_f_p_01_4.html @@ -0,0 +1,490 @@ + + + + + + + +fbgemm_gpu: VecNT< 2, PrimitiveType::FP > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 2, PrimitiveType::FP > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT (half2 a)
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void add (half2 a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void fma (half2 a,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 2 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float2 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4-members.html new file mode 100644 index 000000000..de9730152 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 4, PrimitiveType::FP > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 4, PrimitiveType::FP >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 4, PrimitiveType::FP >
                  add(uint32_t v, int exp_bits, int exp_bias)VecNT< 4, PrimitiveType::FP >inline
                  fma(uint32_t v, int exp_bits, int exp_bias, float b)VecNT< 4, PrimitiveType::FP >inline
                  mul(float a)VecNT< 4, PrimitiveType::FP >inline
                  store(float *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::FP >inline
                  VecNT()VecNT< 4, PrimitiveType::FP >inline
                  VecNT(uint32_t v, const int exp_bits, const int exp_bias)VecNT< 4, PrimitiveType::FP >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html new file mode 100644 index 000000000..ff1a224f5 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_f_p_01_4.html @@ -0,0 +1,518 @@ + + + + + + + +fbgemm_gpu: VecNT< 4, PrimitiveType::FP > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 4, PrimitiveType::FP > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE VecNT (uint32_t v,
                  const int exp_bits,
                  const int exp_bias )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void add (uint32_t v,
                  int exp_bits,
                  int exp_bias )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void fma (uint32_t v,
                  int exp_bits,
                  int exp_bias,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4-members.html new file mode 100644 index 000000000..6598ea5d9 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 4, PrimitiveType::INT > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 4, PrimitiveType::INT >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 4, PrimitiveType::INT >
                  add(uint32_t v, half2 shift_scale)VecNT< 4, PrimitiveType::INT >inline
                  fma(uint32_t v, half2 shift_scale, float b)VecNT< 4, PrimitiveType::INT >inline
                  mul(float a)VecNT< 4, PrimitiveType::INT >inline
                  store(float *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=4)VecNT< 4, PrimitiveType::INT >inline
                  VecNT()VecNT< 4, PrimitiveType::INT >inline
                  VecNT(uint32_t v, half2 shift_scale)VecNT< 4, PrimitiveType::INT >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html new file mode 100644 index 000000000..f46e3219d --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_014_00_01_primitive_type_1_1_i_n_t_01_4.html @@ -0,0 +1,503 @@ + + + + + + + +fbgemm_gpu: VecNT< 4, PrimitiveType::INT > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 4, PrimitiveType::INT > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE VecNT (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void add (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void fma (uint32_t v,
                  half2 shift_scale,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 4 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float4 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4-members.html b/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4-members.html new file mode 100644 index 000000000..511273869 --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4-members.html @@ -0,0 +1,104 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  VecNT< 8, PrimitiveType::INT > Member List
                  +
                  +
                  + +

                  This is the complete list of members for VecNT< 8, PrimitiveType::INT >, including all inherited members.

                  + + + + + + + + + + + + + + + +
                  accVecNT< 8, PrimitiveType::INT >
                  add(uint32_t v, half2 shift_scale)VecNT< 8, PrimitiveType::INT >inline
                  fma(uint32_t v, half2 shift_scale, float b)VecNT< 8, PrimitiveType::INT >inline
                  mul(float a)VecNT< 8, PrimitiveType::INT >inline
                  store(float *output_ptr, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, const int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(uint8_t *output_ptr, float2 qparams, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(float *output_ptr, float2 qparams, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(at::Half *output_ptr, float2 qparams, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  store(at::BFloat16 *output_ptr, float2 qparams, int num_valid_outputs=8)VecNT< 8, PrimitiveType::INT >inline
                  VecNT()VecNT< 8, PrimitiveType::INT >inline
                  VecNT(uint32_t v, half2 shift_scale)VecNT< 8, PrimitiveType::INT >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html b/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html new file mode 100644 index 000000000..8ee10e84d --- /dev/null +++ b/structfbgemm__gpu_1_1_vec_n_t_3_018_00_01_primitive_type_1_1_i_n_t_01_4.html @@ -0,0 +1,503 @@ + + + + + + + +fbgemm_gpu: VecNT< 8, PrimitiveType::INT > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  VecNT< 8, PrimitiveType::INT > Struct Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ VecNT() [1/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE VecNT ()
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ VecNT() [2/2]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE VecNT (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ add()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void add (uint32_t v,
                  half2 shift_scale )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ fma()

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void fma (uint32_t v,
                  half2 shift_scale,
                  float b )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ mul()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void mul (float a)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [1/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  const int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [2/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::BFloat16 * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [3/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [4/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (at::Half * output_ptr,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [5/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [6/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (float * output_ptr,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [7/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  float2 qparams,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store() [8/8]

                  + +
                  +
                  + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE void store (uint8_t * output_ptr,
                  int num_valid_outputs = 8 )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ acc

                  + +
                  +
                  + + + + +
                  float8 acc
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1_weight_row-members.html b/structfbgemm__gpu_1_1_weight_row-members.html new file mode 100644 index 000000000..fe7a65d2a --- /dev/null +++ b/structfbgemm__gpu_1_1_weight_row-members.html @@ -0,0 +1,103 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  WeightRow< emb_t, cache_t, dst_t > Member List
                  +
                  +
                  + +

                  This is the complete list of members for WeightRow< emb_t, cache_t, dst_t >, including all inherited members.

                  + + + + + + + + + + + + + + +
                  cache_row_WeightRow< emb_t, cache_t, dst_t >
                  dim_WeightRow< emb_t, cache_t, dst_t >
                  evict(const Vec4T< dst_t > &v, const int32_t d, const float2 qparams)WeightRow< emb_t, cache_t, dst_t >inline
                  load(const int32_t d, const float2 qparams) constWeightRow< emb_t, cache_t, dst_t >inline
                  load_qparams() constWeightRow< emb_t, cache_t, dst_t >inline
                  row_WeightRow< emb_t, cache_t, dst_t >
                  set_stochastic_rounding(const bool stochastic_rounding, const at::PhiloxCudaState stochastic_rounding_philox_args, const uint64_t salt_value)WeightRow< emb_t, cache_t, dst_t >inline
                  stoc_rounding_state_WeightRow< emb_t, cache_t, dst_t >
                  store(const Vec4T< dst_t > &v, const int32_t d, const float2 qparams)WeightRow< emb_t, cache_t, dst_t >inline
                  store_qparams(const float2 qparams)WeightRow< emb_t, cache_t, dst_t >inline
                  warp_copy_to(WeightRow< emb_t, cache_t, cache_t > &target, const int32_t dim_length, const int32_t num_lanes, const int32_t lane_id) constWeightRow< emb_t, cache_t, dst_t >inline
                  warp_evict(const int32_t dim_length, const int32_t num_lanes, const int32_t lane_id)WeightRow< emb_t, cache_t, dst_t >inline
                  WeightRow(emb_t *row, cache_t *cache_row, int dim, StochasticRoundingRNGState *stoc_rounding_state)WeightRow< emb_t, cache_t, dst_t >inline
                  + + + + diff --git a/structfbgemm__gpu_1_1_weight_row.html b/structfbgemm__gpu_1_1_weight_row.html new file mode 100644 index 000000000..2760bb627 --- /dev/null +++ b/structfbgemm__gpu_1_1_weight_row.html @@ -0,0 +1,470 @@ + + + + + + + +fbgemm_gpu: WeightRow< emb_t, cache_t, dst_t > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  WeightRow< emb_t, cache_t, dst_t > Struct Template Reference
                  +
                  +
                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ WeightRow()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE WeightRow (emb_t * row,
                  cache_t * cache_row,
                  int dim,
                  StochasticRoundingRNGState * stoc_rounding_state )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Function Documentation

                  + +

                  ◆ evict()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void evict (const Vec4T< dst_t > & v,
                  const int32_t d,
                  const float2 qparams )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + +
                  DEVICE_INLINE Vec4T< dst_t > load (const int32_t d,
                  const float2 qparams ) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ load_qparams()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE float2 load_qparams () const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ set_stochastic_rounding()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void set_stochastic_rounding (const bool stochastic_rounding,
                  const at::PhiloxCudaState stochastic_rounding_philox_args,
                  const uint64_t salt_value )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void store (const Vec4T< dst_t > & v,
                  const int32_t d,
                  const float2 qparams )
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ store_qparams()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + +
                  DEVICE_INLINE void store_qparams (const float2 qparams)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ warp_copy_to()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void warp_copy_to (WeightRow< emb_t, cache_t, cache_t > & target,
                  const int32_t dim_length,
                  const int32_t num_lanes,
                  const int32_t lane_id ) const
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ warp_evict()

                  + +
                  +
                  + + + + + + +
                  + + + + + + + + + + + + + + + + +
                  DEVICE_INLINE void warp_evict (const int32_t dim_length,
                  const int32_t num_lanes,
                  const int32_t lane_id )
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ cache_row_

                  + +
                  +
                  + + + + + +
                  cache_t* cache_row_
                  +
                  + +
                  +
                  + +

                  ◆ dim_

                  + +
                  +
                  + + + + + +
                  int dim_
                  +
                  + +
                  +
                  + +

                  ◆ row_

                  + +
                  +
                  + + + + + +
                  emb_t* row_
                  +
                  + +
                  +
                  + +

                  ◆ stoc_rounding_state_

                  + +
                  +
                  + + + + + +
                  StochasticRoundingRNGState* stoc_rounding_state_
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structfbgemm__gpu_1_1rk__state-members.html b/structfbgemm__gpu_1_1rk__state-members.html new file mode 100644 index 000000000..57709d3fc --- /dev/null +++ b/structfbgemm__gpu_1_1rk__state-members.html @@ -0,0 +1,93 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  rk_state Member List
                  +
                  +
                  + +

                  This is the complete list of members for rk_state, including all inherited members.

                  + + + + +
                  gaussrk_state
                  has_gaussrk_state
                  xor128rk_state
                  + + + + diff --git a/structfbgemm__gpu_1_1rk__state.html b/structfbgemm__gpu_1_1rk__state.html new file mode 100644 index 000000000..30e4b038b --- /dev/null +++ b/structfbgemm__gpu_1_1rk__state.html @@ -0,0 +1,135 @@ + + + + + + + +fbgemm_gpu: rk_state Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  rk_state Struct Reference
                  +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ gauss

                  + +
                  +
                  + + + + +
                  double gauss
                  +
                  + +
                  +
                  + +

                  ◆ has_gauss

                  + +
                  +
                  + + + + +
                  int has_gauss
                  +
                  + +
                  +
                  + +

                  ◆ xor128

                  + +
                  +
                  + + + + +
                  unsigned int xor128[4]
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structinternal_1_1_hyper_compressed_sparse_column-members.html b/structinternal_1_1_hyper_compressed_sparse_column-members.html new file mode 100644 index 000000000..1bedfd32e --- /dev/null +++ b/structinternal_1_1_hyper_compressed_sparse_column-members.html @@ -0,0 +1,97 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  HyperCompressedSparseColumn Member List
                  +
                  + + + + + diff --git a/structinternal_1_1_hyper_compressed_sparse_column.html b/structinternal_1_1_hyper_compressed_sparse_column.html new file mode 100644 index 000000000..10ee7003b --- /dev/null +++ b/structinternal_1_1_hyper_compressed_sparse_column.html @@ -0,0 +1,205 @@ + + + + + + + +fbgemm_gpu: HyperCompressedSparseColumn Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  HyperCompressedSparseColumn Struct Reference
                  +
                  +
                  + +

                  #include <embedding_forward_split_cpu.h>

                  +

                  Constructor & Destructor Documentation

                  + +

                  ◆ ~HyperCompressedSparseColumn()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  ~HyperCompressedSparseColumn ()
                  +
                  +inline
                  +
                  + +
                  +
                  +

                  Member Data Documentation

                  + +

                  ◆ column_segment_ids

                  + +
                  +
                  + + + + +
                  int* column_segment_ids = nullptr
                  +
                  + +
                  +
                  + +

                  ◆ column_segment_indices

                  + +
                  +
                  + + + + +
                  int* column_segment_indices = nullptr
                  +
                  + +
                  +
                  + +

                  ◆ column_segment_ptr

                  + +
                  +
                  + + + + +
                  int* column_segment_ptr = nullptr
                  +
                  + +
                  +
                  + +

                  ◆ num_non_zero_columns

                  + +
                  +
                  + + + + +
                  int num_non_zero_columns
                  +
                  + +
                  +
                  + +

                  ◆ row_indices

                  + +
                  +
                  + + + + +
                  int* row_indices = nullptr
                  +
                  + +
                  +
                  + +

                  ◆ weights

                  + +
                  +
                  + + + + +
                  float* weights = nullptr
                  +
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structlog2__calc-members.html b/structlog2__calc-members.html new file mode 100644 index 000000000..5d6c76e37 --- /dev/null +++ b/structlog2__calc-members.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  +
                  log2_calc< x > Member List
                  +
                  +
                  + +

                  This is the complete list of members for log2_calc< x >, including all inherited members.

                  + + +
                  value enum valuelog2_calc< x >
                  + + + + diff --git a/structlog2__calc.html b/structlog2__calc.html new file mode 100644 index 000000000..1ce6193a8 --- /dev/null +++ b/structlog2__calc.html @@ -0,0 +1,110 @@ + + + + + + + +fbgemm_gpu: log2_calc< x > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  + +
                  log2_calc< x > Struct Template Reference
                  +
                  +
                  + +

                  #include <sparse_ops_utils.h>

                  +

                  Member Enumeration Documentation

                  + +

                  ◆ anonymous enum

                  + +
                  +
                  +
                  +template<int x>
                  + + + + +
                  anonymous enum
                  +
                  + + +
                  Enumerator
                  value 
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structlog2__calc__-members.html b/structlog2__calc__-members.html new file mode 100644 index 000000000..d55b14098 --- /dev/null +++ b/structlog2__calc__-members.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  +
                  log2_calc_< x > Member List
                  +
                  +
                  + +

                  This is the complete list of members for log2_calc_< x >, including all inherited members.

                  + + +
                  value enum valuelog2_calc_< x >
                  + + + + diff --git a/structlog2__calc__.html b/structlog2__calc__.html new file mode 100644 index 000000000..31a330d92 --- /dev/null +++ b/structlog2__calc__.html @@ -0,0 +1,110 @@ + + + + + + + +fbgemm_gpu: log2_calc_< x > Struct Template Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  + +
                  log2_calc_< x > Struct Template Reference
                  +
                  +
                  + +

                  #include <sparse_ops_utils.h>

                  +

                  Member Enumeration Documentation

                  + +

                  ◆ anonymous enum

                  + +
                  +
                  +
                  +template<int x>
                  + + + + +
                  anonymous enum
                  +
                  + + +
                  Enumerator
                  value 
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/structlog2__calc___3_010_01_4-members.html b/structlog2__calc___3_010_01_4-members.html new file mode 100644 index 000000000..94a5daf26 --- /dev/null +++ b/structlog2__calc___3_010_01_4-members.html @@ -0,0 +1,87 @@ + + + + + + + +fbgemm_gpu: Member List + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  +
                  log2_calc_< 0 > Member List
                  +
                  +
                  + +

                  This is the complete list of members for log2_calc_< 0 >, including all inherited members.

                  + + +
                  value enum valuelog2_calc_< 0 >
                  + + + + diff --git a/structlog2__calc___3_010_01_4.html b/structlog2__calc___3_010_01_4.html new file mode 100644 index 000000000..7dfce26b8 --- /dev/null +++ b/structlog2__calc___3_010_01_4.html @@ -0,0 +1,108 @@ + + + + + + + +fbgemm_gpu: log2_calc_< 0 > Struct Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + +
                  +
                  + +
                  log2_calc_< 0 > Struct Reference
                  +
                  +
                  + +

                  #include <sparse_ops_utils.h>

                  +

                  Member Enumeration Documentation

                  + +

                  ◆ anonymous enum

                  + +
                  +
                  + + + + +
                  anonymous enum
                  +
                  + + +
                  Enumerator
                  value 
                  + +
                  +
                  +
                  The documentation for this struct was generated from the following file: +
                  + + + + diff --git a/tabs.css b/tabs.css index 71c8a4704..fe4854aa5 100644 --- a/tabs.css +++ b/tabs.css @@ -1 +1 @@ -.sm{position:relative;z-index:9999}.sm,.sm ul,.sm li{display:block;list-style:none;margin:0;padding:0;line-height:normal;direction:ltr;text-align:left;-webkit-tap-highlight-color:rgba(0,0,0,0)}.sm-rtl,.sm-rtl ul,.sm-rtl li{direction:rtl;text-align:right}.sm>li>h1,.sm>li>h2,.sm>li>h3,.sm>li>h4,.sm>li>h5,.sm>li>h6{margin:0;padding:0}.sm ul{display:none}.sm li,.sm a{position:relative}.sm a{display:block}.sm a.disabled{cursor:not-allowed}.sm:after{content:"\00a0";display:block;height:0;font:0/0 serif;clear:both;visibility:hidden;overflow:hidden}.sm,.sm *,.sm *:before,.sm *:after{-moz-box-sizing:border-box;-webkit-box-sizing:border-box;box-sizing:border-box}.main-menu-btn{position:relative;display:inline-block;width:36px;height:36px;text-indent:36px;margin-left:8px;white-space:nowrap;overflow:hidden;cursor:pointer;-webkit-tap-highlight-color:rgba(0,0,0,0)}.main-menu-btn-icon,.main-menu-btn-icon:before,.main-menu-btn-icon:after{position:absolute;top:50%;left:2px;height:2px;width:24px;background:var(--nav-menu-button-color);-webkit-transition:all .25s;transition:all .25s}.main-menu-btn-icon:before{content:'';top:-7px;left:0}.main-menu-btn-icon:after{content:'';top:7px;left:0}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon{height:0}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon:before{top:0;-webkit-transform:rotate(-45deg);transform:rotate(-45deg)}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon:after{top:0;-webkit-transform:rotate(45deg);transform:rotate(45deg)}#main-menu-state{position:absolute;width:1px;height:1px;margin:-1px;border:0;padding:0;overflow:hidden;clip:rect(1px,1px,1px,1px)}#main-menu-state:not(:checked) ~ #main-menu{display:none}#main-menu-state:checked ~ #main-menu{display:block}@media(min-width:768px){.main-menu-btn{position:absolute;top:-99999px}#main-menu-state:not(:checked) ~ 
#main-menu{display:block}}.sm-dox{background-image:var(--nav-gradient-image)}.sm-dox a,.sm-dox a:focus,.sm-dox a:hover,.sm-dox a:active{padding:0 12px;padding-right:43px;font-family:var(--font-family-nav);font-size:13px;font-weight:bold;line-height:36px;text-decoration:none;text-shadow:var(--nav-text-normal-shadow);color:var(--nav-text-normal-color);outline:0}.sm-dox a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox a.current{color:#d23600}.sm-dox a.disabled{color:#bbb}.sm-dox a span.sub-arrow{position:absolute;top:50%;margin-top:-14px;left:auto;right:3px;width:28px;height:28px;overflow:hidden;font:bold 12px/28px monospace !important;text-align:center;text-shadow:none;background:var(--nav-menu-toggle-color);-moz-border-radius:5px;-webkit-border-radius:5px;border-radius:5px}.sm-dox a span.sub-arrow:before{display:block;content:'+'}.sm-dox a.highlighted span.sub-arrow:before{display:block;content:'-'}.sm-dox>li:first-child>a,.sm-dox>li:first-child>:not(ul) a{-moz-border-radius:5px 5px 0 0;-webkit-border-radius:5px;border-radius:5px 5px 0 0}.sm-dox>li:last-child>a,.sm-dox>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) 
a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul{-moz-border-radius:0 0 5px 5px;-webkit-border-radius:0;border-radius:0 0 5px 5px}.sm-dox>li:last-child>a.highlighted,.sm-dox>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a.highlighted{-moz-border-radius:0;-webkit-border-radius:0;border-radius:0}.sm-dox ul{background:var(--nav-menu-background-color)}.sm-dox ul a,.sm-dox ul a:focus,.sm-dox ul a:hover,.sm-dox ul a:active{font-size:12px;border-left:8px solid transparent;line-height:36px;text-shadow:none;background-color:var(--nav-menu-background-color);background-image:none}.sm-dox ul a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:0 1px 1px black}.sm-dox ul ul a,.sm-dox ul ul a:hover,.sm-dox ul ul a:focus,.sm-dox ul ul a:active{border-left:16px solid transparent}.sm-dox ul ul ul a,.sm-dox ul ul ul a:hover,.sm-dox ul ul ul a:focus,.sm-dox ul ul ul a:active{border-left:24px solid transparent}.sm-dox ul ul ul ul a,.sm-dox ul ul ul ul 
a:hover,.sm-dox ul ul ul ul a:focus,.sm-dox ul ul ul ul a:active{border-left:32px solid transparent}.sm-dox ul ul ul ul ul a,.sm-dox ul ul ul ul ul a:hover,.sm-dox ul ul ul ul ul a:focus,.sm-dox ul ul ul ul ul a:active{border-left:40px solid transparent}@media(min-width:768px){.sm-dox ul{position:absolute;width:12em}.sm-dox li{float:left}.sm-dox.sm-rtl li{float:right}.sm-dox ul li,.sm-dox.sm-rtl ul li,.sm-dox.sm-vertical li{float:none}.sm-dox a{white-space:nowrap}.sm-dox ul a,.sm-dox.sm-vertical a{white-space:normal}.sm-dox .sm-nowrap>li>a,.sm-dox .sm-nowrap>li>:not(ul) a{white-space:nowrap}.sm-dox{padding:0 10px;background-image:var(--nav-gradient-image);line-height:36px}.sm-dox a span.sub-arrow{top:50%;margin-top:-2px;right:12px;width:0;height:0;border-width:4px;border-style:solid dashed dashed dashed;border-color:var(--nav-text-normal-color) transparent transparent transparent;background:transparent;-moz-border-radius:0;-webkit-border-radius:0;border-radius:0}.sm-dox a,.sm-dox a:focus,.sm-dox a:active,.sm-dox a:hover,.sm-dox a.highlighted{padding:0 12px;background-image:var(--nav-separator-image);background-repeat:no-repeat;background-position:right;-moz-border-radius:0 !important;-webkit-border-radius:0;border-radius:0 !important}.sm-dox a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox a:hover span.sub-arrow{border-color:var(--nav-text-hover-color) transparent transparent transparent}.sm-dox a.has-submenu{padding-right:24px}.sm-dox li{border-top:0}.sm-dox>li>ul:before,.sm-dox>li>ul:after{content:'';position:absolute;top:-18px;left:30px;width:0;height:0;overflow:hidden;border-width:9px;border-style:dashed dashed solid dashed;border-color:transparent transparent #bbb transparent}.sm-dox>li>ul:after{top:-16px;left:31px;border-width:8px;border-color:transparent transparent var(--nav-menu-background-color) transparent}.sm-dox ul{border:1px solid 
#bbb;padding:5px 0;background:var(--nav-menu-background-color);-moz-border-radius:5px !important;-webkit-border-radius:5px;border-radius:5px !important;-moz-box-shadow:0 5px 9px rgba(0,0,0,0.2);-webkit-box-shadow:0 5px 9px rgba(0,0,0,0.2);box-shadow:0 5px 9px rgba(0,0,0,0.2)}.sm-dox ul a span.sub-arrow{right:8px;top:50%;margin-top:-5px;border-width:5px;border-color:transparent transparent transparent var(--nav-menu-foreground-color);border-style:dashed dashed dashed solid}.sm-dox ul a,.sm-dox ul a:hover,.sm-dox ul a:focus,.sm-dox ul a:active,.sm-dox ul a.highlighted{color:var(--nav-menu-foreground-color);background-image:none;border:0 !important;color:var(--nav-menu-foreground-color);background-image:none}.sm-dox ul a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox ul a:hover span.sub-arrow{border-color:transparent transparent transparent var(--nav-text-hover-color)}.sm-dox span.scroll-up,.sm-dox span.scroll-down{position:absolute;display:none;visibility:hidden;overflow:hidden;background:var(--nav-menu-background-color);height:36px}.sm-dox span.scroll-up:hover,.sm-dox span.scroll-down:hover{background:#eee}.sm-dox span.scroll-up:hover span.scroll-up-arrow,.sm-dox span.scroll-up:hover span.scroll-down-arrow{border-color:transparent transparent #d23600 transparent}.sm-dox span.scroll-down:hover span.scroll-down-arrow{border-color:#d23600 transparent transparent transparent}.sm-dox span.scroll-up-arrow,.sm-dox span.scroll-down-arrow{position:absolute;top:0;left:50%;margin-left:-6px;width:0;height:0;overflow:hidden;border-width:6px;border-style:dashed dashed solid dashed;border-color:transparent transparent var(--nav-menu-foreground-color) transparent}.sm-dox span.scroll-down-arrow{top:8px;border-style:solid dashed dashed dashed;border-color:var(--nav-menu-foreground-color) transparent transparent transparent}.sm-dox.sm-rtl 
a.has-submenu{padding-right:12px;padding-left:24px}.sm-dox.sm-rtl a span.sub-arrow{right:auto;left:12px}.sm-dox.sm-rtl.sm-vertical a.has-submenu{padding:10px 20px}.sm-dox.sm-rtl.sm-vertical a span.sub-arrow{right:auto;left:8px;border-style:dashed solid dashed dashed;border-color:transparent #555 transparent transparent}.sm-dox.sm-rtl>li>ul:before{left:auto;right:30px}.sm-dox.sm-rtl>li>ul:after{left:auto;right:31px}.sm-dox.sm-rtl ul a.has-submenu{padding:10px 20px !important}.sm-dox.sm-rtl ul a span.sub-arrow{right:auto;left:8px;border-style:dashed solid dashed dashed;border-color:transparent #555 transparent transparent}.sm-dox.sm-vertical{padding:10px 0;-moz-border-radius:5px;-webkit-border-radius:5px;border-radius:5px}.sm-dox.sm-vertical a{padding:10px 20px}.sm-dox.sm-vertical a:hover,.sm-dox.sm-vertical a:focus,.sm-dox.sm-vertical a:active,.sm-dox.sm-vertical a.highlighted{background:#fff}.sm-dox.sm-vertical a.disabled{background-image:var(--nav-gradient-image)}.sm-dox.sm-vertical a span.sub-arrow{right:8px;top:50%;margin-top:-5px;border-width:5px;border-style:dashed dashed dashed solid;border-color:transparent transparent transparent #555}.sm-dox.sm-vertical>li>ul:before,.sm-dox.sm-vertical>li>ul:after{display:none}.sm-dox.sm-vertical ul a{padding:10px 20px}.sm-dox.sm-vertical ul a:hover,.sm-dox.sm-vertical ul a:focus,.sm-dox.sm-vertical ul a:active,.sm-dox.sm-vertical ul a.highlighted{background:#eee}.sm-dox.sm-vertical ul a.disabled{background:var(--nav-menu-background-color)}} \ No newline at end of file +.sm{position:relative;z-index:9999}.sm,.sm ul,.sm li{display:block;list-style:none;margin:0;padding:0;line-height:normal;direction:ltr;text-align:left;-webkit-tap-highlight-color:rgba(0,0,0,0)}.sm-rtl,.sm-rtl ul,.sm-rtl li{direction:rtl;text-align:right}.sm>li>h1,.sm>li>h2,.sm>li>h3,.sm>li>h4,.sm>li>h5,.sm>li>h6{margin:0;padding:0}.sm ul{display:none}.sm li,.sm a{position:relative}.sm a{display:block}.sm 
a.disabled{cursor:not-allowed}.sm:after{content:"\00a0";display:block;height:0;font:0/0 serif;clear:both;visibility:hidden;overflow:hidden}.sm,.sm *,.sm *:before,.sm *:after{-moz-box-sizing:border-box;-webkit-box-sizing:border-box;box-sizing:border-box}.main-menu-btn{position:relative;display:inline-block;width:36px;height:36px;text-indent:36px;margin-left:8px;white-space:nowrap;overflow:hidden;cursor:pointer;-webkit-tap-highlight-color:rgba(0,0,0,0)}.main-menu-btn-icon,.main-menu-btn-icon:before,.main-menu-btn-icon:after{position:absolute;top:50%;left:2px;height:2px;width:24px;background:var(--nav-menu-button-color);-webkit-transition:all .25s;transition:all .25s}.main-menu-btn-icon:before{content:'';top:-7px;left:0}.main-menu-btn-icon:after{content:'';top:7px;left:0}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon{height:0}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon:before{top:0;-webkit-transform:rotate(-45deg);transform:rotate(-45deg)}#main-menu-state:checked ~ .main-menu-btn .main-menu-btn-icon:after{top:0;-webkit-transform:rotate(45deg);transform:rotate(45deg)}#main-menu-state{position:absolute;width:1px;height:1px;margin:-1px;border:0;padding:0;overflow:hidden;clip:rect(1px,1px,1px,1px)}#main-menu-state:not(:checked) ~ #main-menu{display:none}#main-menu-state:checked ~ #main-menu{display:block}@media(min-width:768px){.main-menu-btn{position:absolute;top:-99999px}#main-menu-state:not(:checked) ~ #main-menu{display:block}}.sm-dox{background-image:var(--nav-gradient-image)}.sm-dox a,.sm-dox a:focus,.sm-dox a:hover,.sm-dox a:active{padding:0 12px;padding-right:43px;font-family:var(--font-family-nav);font-size:13px;font-weight:bold;line-height:36px;text-decoration:none;text-shadow:var(--nav-text-normal-shadow);color:var(--nav-text-normal-color);outline:0}.sm-dox a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox 
a.current{color:#d23600}.sm-dox a.disabled{color:#bbb}.sm-dox a span.sub-arrow{position:absolute;top:50%;margin-top:-14px;left:auto;right:3px;width:28px;height:28px;overflow:hidden;font:bold 12px/28px monospace !important;text-align:center;text-shadow:none;background:var(--nav-menu-toggle-color);-moz-border-radius:5px;-webkit-border-radius:5px;border-radius:5px}.sm-dox a span.sub-arrow:before{display:block;content:'+'}.sm-dox a.highlighted span.sub-arrow:before{display:block;content:'-'}.sm-dox>li:first-child>a,.sm-dox>li:first-child>:not(ul) a{-moz-border-radius:5px 5px 0 0;-webkit-border-radius:5px;border-radius:5px 5px 0 0}.sm-dox>li:last-child>a,.sm-dox>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul{-moz-border-radius:0 0 5px 5px;-webkit-border-radius:0;border-radius:0 0 5px 5px}.sm-dox>li:last-child>a.highlighted,.sm-dox>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) 
a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>a.highlighted,.sm-dox>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>ul>li:last-child>*:not(ul) a.highlighted{-moz-border-radius:0;-webkit-border-radius:0;border-radius:0}.sm-dox ul{background:var(--nav-menu-background-color)}.sm-dox ul a,.sm-dox ul a:focus,.sm-dox ul a:hover,.sm-dox ul a:active{font-size:12px;border-left:8px solid transparent;line-height:36px;text-shadow:none;background-color:var(--nav-menu-background-color);background-image:none}.sm-dox ul a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:0 1px 1px black}.sm-dox ul ul a,.sm-dox ul ul a:hover,.sm-dox ul ul a:focus,.sm-dox ul ul a:active{border-left:16px solid transparent}.sm-dox ul ul ul a,.sm-dox ul ul ul a:hover,.sm-dox ul ul ul a:focus,.sm-dox ul ul ul a:active{border-left:24px solid transparent}.sm-dox ul ul ul ul a,.sm-dox ul ul ul ul a:hover,.sm-dox ul ul ul ul a:focus,.sm-dox ul ul ul ul a:active{border-left:32px solid transparent}.sm-dox ul ul ul ul ul a,.sm-dox ul ul ul ul ul a:hover,.sm-dox ul ul ul ul ul a:focus,.sm-dox ul ul ul ul ul a:active{border-left:40px solid transparent}@media(min-width:768px){.sm-dox ul{position:absolute;width:12em}.sm-dox li{float:left}.sm-dox.sm-rtl li{float:right}.sm-dox ul li,.sm-dox.sm-rtl ul li,.sm-dox.sm-vertical li{float:none}.sm-dox a{white-space:nowrap}.sm-dox ul a,.sm-dox.sm-vertical a{white-space:normal}.sm-dox .sm-nowrap>li>a,.sm-dox .sm-nowrap>li>:not(ul) a{white-space:nowrap}.sm-dox{padding:0 10px;background-image:var(--nav-gradient-image);line-height:36px}.sm-dox a span.sub-arrow{top:50%;margin-top:-2px;right:12px;width:0;height:0;border-width:4px;border-style:solid 
dashed dashed dashed;border-color:var(--nav-text-normal-color) transparent transparent transparent;background:transparent;-moz-border-radius:0;-webkit-border-radius:0;border-radius:0}.sm-dox a,.sm-dox a:focus,.sm-dox a:active,.sm-dox a:hover,.sm-dox a.highlighted{padding:0 12px;background-image:var(--nav-separator-image);background-repeat:no-repeat;background-position:right;-moz-border-radius:0 !important;-webkit-border-radius:0;border-radius:0 !important}.sm-dox a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox a:hover span.sub-arrow{border-color:var(--nav-text-hover-color) transparent transparent transparent}.sm-dox a.has-submenu{padding-right:24px}.sm-dox li{border-top:0}.sm-dox>li>ul:before,.sm-dox>li>ul:after{content:'';position:absolute;top:-18px;left:30px;width:0;height:0;overflow:hidden;border-width:9px;border-style:dashed dashed solid dashed;border-color:transparent transparent #bbb transparent}.sm-dox>li>ul:after{top:-16px;left:31px;border-width:8px;border-color:transparent transparent var(--nav-menu-background-color) transparent}.sm-dox ul{border:1px solid #bbb;padding:5px 0;background:var(--nav-menu-background-color);-moz-border-radius:5px !important;-webkit-border-radius:5px;border-radius:5px !important;-moz-box-shadow:0 5px 9px rgba(0,0,0,0.2);-webkit-box-shadow:0 5px 9px rgba(0,0,0,0.2);box-shadow:0 5px 9px rgba(0,0,0,0.2)}.sm-dox ul a span.sub-arrow{right:8px;top:50%;margin-top:-5px;border-width:5px;border-color:transparent transparent transparent var(--nav-menu-foreground-color);border-style:dashed dashed dashed solid}.sm-dox ul a,.sm-dox ul a:hover,.sm-dox ul a:focus,.sm-dox ul a:active,.sm-dox ul a.highlighted{color:var(--nav-menu-foreground-color);background-image:none;border:0 !important}.sm-dox ul 
a:hover{background-image:var(--nav-gradient-active-image);background-repeat:repeat-x;color:var(--nav-text-hover-color);text-shadow:var(--nav-text-hover-shadow)}.sm-dox ul a:hover span.sub-arrow{border-color:transparent transparent transparent var(--nav-text-hover-color)}.sm-dox span.scroll-up,.sm-dox span.scroll-down{position:absolute;display:none;visibility:hidden;overflow:hidden;background:var(--nav-menu-background-color);height:36px}.sm-dox span.scroll-up:hover,.sm-dox span.scroll-down:hover{background:#eee}.sm-dox span.scroll-up:hover span.scroll-up-arrow,.sm-dox span.scroll-up:hover span.scroll-down-arrow{border-color:transparent transparent #d23600 transparent}.sm-dox span.scroll-down:hover span.scroll-down-arrow{border-color:#d23600 transparent transparent transparent}.sm-dox span.scroll-up-arrow,.sm-dox span.scroll-down-arrow{position:absolute;top:0;left:50%;margin-left:-6px;width:0;height:0;overflow:hidden;border-width:6px;border-style:dashed dashed solid dashed;border-color:transparent transparent var(--nav-menu-foreground-color) transparent}.sm-dox span.scroll-down-arrow{top:8px;border-style:solid dashed dashed dashed;border-color:var(--nav-menu-foreground-color) transparent transparent transparent}.sm-dox.sm-rtl a.has-submenu{padding-right:12px;padding-left:24px}.sm-dox.sm-rtl a span.sub-arrow{right:auto;left:12px}.sm-dox.sm-rtl.sm-vertical a.has-submenu{padding:10px 20px}.sm-dox.sm-rtl.sm-vertical a span.sub-arrow{right:auto;left:8px;border-style:dashed solid dashed dashed;border-color:transparent #555 transparent transparent}.sm-dox.sm-rtl>li>ul:before{left:auto;right:30px}.sm-dox.sm-rtl>li>ul:after{left:auto;right:31px}.sm-dox.sm-rtl ul a.has-submenu{padding:10px 20px !important}.sm-dox.sm-rtl ul a span.sub-arrow{right:auto;left:8px;border-style:dashed solid dashed dashed;border-color:transparent #555 transparent transparent}.sm-dox.sm-vertical{padding:10px 0;-moz-border-radius:5px;-webkit-border-radius:5px;border-radius:5px}.sm-dox.sm-vertical 
a{padding:10px 20px}.sm-dox.sm-vertical a:hover,.sm-dox.sm-vertical a:focus,.sm-dox.sm-vertical a:active,.sm-dox.sm-vertical a.highlighted{background:#fff}.sm-dox.sm-vertical a.disabled{background-image:var(--nav-gradient-image)}.sm-dox.sm-vertical a span.sub-arrow{right:8px;top:50%;margin-top:-5px;border-width:5px;border-style:dashed dashed dashed solid;border-color:transparent transparent transparent #555}.sm-dox.sm-vertical>li>ul:before,.sm-dox.sm-vertical>li>ul:after{display:none}.sm-dox.sm-vertical ul a{padding:10px 20px}.sm-dox.sm-vertical ul a:hover,.sm-dox.sm-vertical ul a:focus,.sm-dox.sm-vertical ul a:active,.sm-dox.sm-vertical ul a.highlighted{background:#eee}.sm-dox.sm-vertical ul a.disabled{background:var(--nav-menu-background-color)}} \ No newline at end of file diff --git a/tensor__assert__test_8cpp.html b/tensor__assert__test_8cpp.html new file mode 100644 index 000000000..d81ea8818 --- /dev/null +++ b/tensor__assert__test_8cpp.html @@ -0,0 +1,112 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/test/tensor_assert_test.cpp File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  tensor_assert_test.cpp File Reference
                  +
                  +
                  +
                  #include <ATen/ATen.h>
                  +#include <gtest/gtest.h>
                  +#include "fbgemm_gpu/sparse_ops_utils.h"
                  +

                  Function Documentation

                  + +

                  ◆ TEST()

                  + +
                  +
                  + + + + + + + + + + + +
                  TEST (tensor_assert_test ,
                  gpu_asserts  )
                  +
                  + +
                  +
                  +
                  + + + + diff --git a/topics.html b/topics.html index 7a3526d06..61bc94fe9 100644 --- a/topics.html +++ b/topics.html @@ -3,12 +3,14 @@ - + fbgemm_gpu: Topics + + @@ -29,7 +31,7 @@ - + @@ -74,28 +76,28 @@ diff --git a/topology__utils_8cpp.html b/topology__utils_8cpp.html new file mode 100644 index 000000000..9a5481c35 --- /dev/null +++ b/topology__utils_8cpp.html @@ -0,0 +1,124 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/topology_utils.cpp File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  topology_utils.cpp File Reference
                  +
                  +
                  +
                  #include <ATen/cuda/CUDAContext.h>
                  +#include <c10/core/Device.h>
                  +#include <c10/cuda/CUDAException.h>
                  +#include <c10/util/Logging.h>
                  +#include <algorithm>
                  +#include "fbgemm_gpu/topology_utils.h"
                  +#include <nvml.h>
                  +
                  + + + +

                  +Namespaces

                  namespace  fbgemm_gpu
                   
                  +

                  Macro Definition Documentation

                  + +

                  ◆ NVML_CHECK

                  + +
                  +
                  + + + + + + + +
                  #define NVML_CHECK( fn)
                  +
                  +Value:
                  do { \
                  +
                  nvmlReturn_t ret = (fn); \
                  +
                  TORCH_CHECK_EQ((ret), NVML_SUCCESS); \
                  +
                  } while (0)
                  +
                  +
                  +
                  +
                  + + + + diff --git a/topology__utils_8h.html b/topology__utils_8h.html new file mode 100644 index 000000000..14fae9a3c --- /dev/null +++ b/topology__utils_8h.html @@ -0,0 +1,141 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/include/fbgemm_gpu/topology_utils.h File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  + +
                  topology_utils.h File Reference
                  +
                  +
                  +
                  #include <functional>
                  +
                  + + + +

                  +Namespaces

                  namespace  fbgemm_gpu
                   
                  +

                  Typedef Documentation

                  + +

                  ◆ AdjacencyMatrix

                  + +
                  +
                  +
                  +template<typename T >
                  + + + + +
                  using AdjacencyMatrix = std::function<T(Node, Node)>
                  +
                  + +
                  +
                  + +

                  ◆ Links

                  + +
                  +
                  + + + + +
                  using Links = int64_t
                  +
                  + +
                  +
                  + +

                  ◆ Node

                  + +
                  +
                  + + + + +
                  using Node = int64_t
                  +
                  + +
                  +
                  +
                  + + + + diff --git a/transpose__embedding__input_8cu.html b/transpose__embedding__input_8cu.html new file mode 100644 index 000000000..8cc67f53a --- /dev/null +++ b/transpose__embedding__input_8cu.html @@ -0,0 +1,302 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/src/split_embeddings_utils/transpose_embedding_input.cu File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  transpose_embedding_input.cu File Reference
                  +
                  +
                  +
                  #include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
                  +#include "fbgemm_gpu/ops_utils.h"
                  +#include "fbgemm_gpu/split_embeddings_utils.cuh"
                  +#include "fbgemm_gpu/cub_namespace_prefix.cuh"
                  +#include <cub/device/device_radix_sort.cuh>
                  +#include <cub/device/device_run_length_encode.cuh>
                  +#include <cub/device/device_scan.cuh>
                  +#include "fbgemm_gpu/cub_namespace_postfix.cuh"
                  +

                  Macro Definition Documentation

                  + +

                  ◆ INVOKE_LINEARIZE_INDEX_KERNEL

                  + +
                  +
                  + + + + + + + + + + + +
                  #define INVOKE_LINEARIZE_INDEX_KERNEL( INFO_ACC_T,
                  NOBAG )
                  +
                  +Value:
                  const auto linearize_index_kernel_ = \
                  + + + +
                  div_round_up(total_B, kMaxThreads), \
                  +
                  kMaxThreads, \
                  +
                  0, \
                  +
                  at::cuda::getCurrentCUDAStream()>>>( \
                  +
                  hash_size_cumsum.packed_accessor32<index_t, 1, RestrictPtrTraits>(), \
                  +
                  indices.packed_accessor32<index_t, 1, RestrictPtrTraits>(), \
                  +
                  offsets.packed_accessor32<index_t, 1, RestrictPtrTraits>(), \
                  +
                  infos.packed_accessor32<INFO_ACC_T, 1, RestrictPtrTraits>(), \
                  +
                  linear_indices.packed_accessor32<index_t, 1, RestrictPtrTraits>(), \
                  + + + +
                  (1u << info_B_num_bits) - 1, \
                  +
                  vbe ? reinterpret_cast<uint32_t*>(vbe_b_t_map.value().data_ptr()) \
                  +
                  : nullptr, \
                  + + +
                  Definition fbgemm_cuda_utils.cuh:3610
                  +
                  Definition fbgemm_tensor_accessor.h:128
                  +
                  template __global__ uint32_t
                  Definition gen_embedding_backward_split_grad.cu:137
                  +
                  __global__ const int32_t const int32_t T
                  Definition sparse_batched_unary_embeddings.cu:21
                  +
                  __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > const int32_t *__restrict__ const int32_t *__restrict__ const int32_t *__restrict__ const int32_t info_B_num_bits
                  Definition sparse_batched_unary_embeddings.cu:128
                  +
                  __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ indices
                  Definition sparse_batched_unary_embeddings.cu:26
                  +
                  __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ const index_t *__restrict__ offsets
                  Definition sparse_batched_unary_embeddings.cu:25
                  +
                  __global__ const int32_t const int32_t const scalar_t *__restrict__ const index_t *__restrict__ scalar_t *__restrict__ const at::PackedTensorAccessor32< index_t, 1, at::RestrictPtrTraits > const int32_t *__restrict__ const int32_t *__restrict__ const int32_t *__restrict__ const int32_t const uint32_t info_B_mask
                  Definition sparse_batched_unary_embeddings.cu:129
                  +
                  constexpr int DEFAULT_INFO_NUM_BITS
                  Definition split_embeddings_utils.cuh:17
                  +
                  +
                  +
                  +

                  Typedef Documentation

                  + +

                  ◆ Tensor

                  + +
                  +
                  + + + + +
                  using Tensor = at::Tensor
                  +
                  + +
                  +
                  +

                  Function Documentation

                  + +

                  ◆ __launch_bounds__() [1/2]

                  + +
                  +
                  + + + + + + + + +
                  __global__ __launch_bounds__ (kMaxThreads ) const
                  +
                  + +
                  +
                  + +

                  ◆ __launch_bounds__() [2/2]

                  + +
                  +
                  +
                  +template<typename index_t , typename info_acc_t , bool nobag, bool vbe>
                  + + + + + + + +
                  __global__ __launch_bounds__ (kMaxThreads ) const
                  +
                  + +
                  +
                  + +

                  ◆ asynchronous_complete_cumsum()

                  + +
                  +
                  + + + + + +
                  + + + + + + + +
                  at::Tensor asynchronous_complete_cumsum (at::Tensor t_in)
                  +
                  +inline
                  +
                  + +
                  +
                  + +

                  ◆ transpose_embedding_input()

                  + +
                  +
                  + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  DLL_PUBLIC std::tuple< Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor > transpose_embedding_input (Tensor hash_size_cumsum,
                  int64_t total_hash_size_bits,
                  Tensor indices,
                  Tensor offsets,
                  bool nobag,
                  const c10::optional< Tensor > & vbe_b_t_map,
                  const int64_t info_B_num_bits,
                  const int64_t info_B_mask,
                  const int64_t total_unique_indices,
                  const bool is_index_select,
                  const c10::optional< Tensor > & total_L_offsets,
                  const int64_t fixed_L_per_warp,
                  const int64_t num_warps_per_feature )
                  +
                  + +
                  +
                  +
                  + + + + diff --git a/uvm__cache__miss__emulate__test_8cpp.html b/uvm__cache__miss__emulate__test_8cpp.html new file mode 100644 index 000000000..c775eed4f --- /dev/null +++ b/uvm__cache__miss__emulate__test_8cpp.html @@ -0,0 +1,184 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/test/uvm_cache_miss_emulate_test.cpp File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  uvm_cache_miss_emulate_test.cpp File Reference
                  +
                  +
                  +
                  #include <gtest/gtest.h>
                  +#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
                  +

                  Function Documentation

                  + +

                  ◆ generate_lxu_cache_locations()

                  + +
                  +
                  + + + + + + + + + + + + + + + + +
                  at::Tensor generate_lxu_cache_locations (const int64_t num_requests,
                  const int64_t num_sets,
                  const int64_t associativity = 32 )
                  +
                  + +
                  +
                  + +

                  ◆ run_emulate_cache_miss()

                  + +
                  +
                  + + + + + + + + + + + + + + + + +
                  std::pair< at::Tensor, at::Tensor > run_emulate_cache_miss (at::Tensor lxu_cache_locations,
                  const int64_t enforced_misses_per_256,
                  const bool gather_uvm_stats = false )
                  +
                  + +
                  +
                  + +

                  ◆ TEST() [1/2]

                  + +
                  +
                  + + + + + + + + + + + +
                  TEST (uvm_cache_miss_emulate_test ,
                  enforced_cache_miss  )
                  +
                  + +
                  +
                  + +

                  ◆ TEST() [2/2]

                  + +
                  +
                  + + + + + + + + + + + +
                  TEST (uvm_cache_miss_emulate_test ,
                  no_cache_miss  )
                  +
                  + +
                  +
                  +
                  + + + + diff --git a/verify__fp16__stochastic__benchmark_8cu.html b/verify__fp16__stochastic__benchmark_8cu.html new file mode 100644 index 000000000..bc5887ce2 --- /dev/null +++ b/verify__fp16__stochastic__benchmark_8cu.html @@ -0,0 +1,429 @@ + + + + + + + +fbgemm_gpu: /__w/FBGEMM/FBGEMM/fbgemm_gpu/bench/verify_fp16_stochastic_benchmark.cu File Reference + + + + + + + + + + + +
                  +
                  + + + + + + +
                  +
                  fbgemm_gpu +
                  +
                  +
                  + + + + + + + + +
                  +
                  + + +
                  +
                  +
                  +
                  +
                  +
                  Loading...
                  +
                  Searching...
                  +
                  No Matches
                  +
                  +
                  +
                  +
                  + + +
                  +
                  +
                  verify_fp16_stochastic_benchmark.cu File Reference
                  +
                  +
                  +
                  #include <c10/cuda/CUDAException.h>
                  +#include <cuda.h>
                  +#include <cuda_fp16.h>
                  +#include <curand.h>
                  +#include <curand_kernel.h>
                  +#include <unistd.h>
                  +#include <chrono>
                  +#include <iostream>
                  +#include <vector>
                  +

                  Function Documentation

                  + +

                  ◆ convert_float_to_half_assemblefloat()

                  + +
                  +
                  + + + + + + + + + + + + + + + + + + + + + +
                  __global__ void convert_float_to_half_assemblefloat (half * dst,
                  float * src,
                  uint8_t * r,
                  int size )
                  +
                  + +
                  +
                  + +

                  ◆ convert_float_to_half_bitcarry()

                  + +
                  +
                  + + + + + + + + + + + + + + + + +
                  __global__ void convert_float_to_half_bitcarry (half * dst,
                  float * src,
                  int size )
                  +
                  + +
                  +
                  + +

                  ◆ convert_float_to_half_direct()

                  + +
                  +
                  + + + + + + + + + + + + + + + + +
                  __global__ void convert_float_to_half_direct (half * dst,
                  float * src,
                  int size )
                  +
                  + +
                  +
                  + +

                  ◆ convert_float_to_half_shortrand()

                  + +
                  +
                  + + + + + + + + + + + + + + + + + + + + + +
                  __global__ void convert_float_to_half_shortrand (half * dst,
                  float * src,
                  uint8_t * r,
                  int size )
                  +
                  + +
                  +
                  + +

                  ◆ float_to_sto_half_assemblefloat()

                  + +
                  +
                  + + + + + + + + + + + +
                  __device__ half float_to_sto_half_assemblefloat (float w,
                  uint8_t rand )
                  +
                  + +
                  +
                  + +

                  ◆ float_to_sto_half_bitcarry()

                  + +
                  +
                  + + + + + + + +
                  __device__ half float_to_sto_half_bitcarry (float w)
                  +
                  + +
                  +
                  + +

                  ◆ float_to_sto_half_direct()

                  + +
                  +
                  + + + + + + + +
                  __device__ half float_to_sto_half_direct (float w)
                  +
                  + +
                  +
                  + +

                  ◆ float_to_sto_half_shortrand()

                  + +
                  +
                  + + + + + + + + + + + +
                  __device__ half float_to_sto_half_shortrand (float w,
                  uint8_t rand )
                  +
                  + +
                  +
                  + +

                  ◆ flush_cache()

                  + +
                  +
                  + + + + + + + + + + + + + + + + + + + + + + + + + + +
                  void flush_cache (std::vector< char > flush,
                  char * d_flush,
                  char * d_flush2,
                  int cache_size,
                  bool do_write = false )
                  +
                  + +
                  +
                  + +

                  ◆ flush_gpu()

                  + +
                  +
                  + + + + + + + + + + + + + + + + +
                  __global__ void flush_gpu (char * d_flush,
                  char * d_flush2,
                  bool do_write )
                  +
                  + +
                  +
                  + +

                  ◆ gen_8bit_random()

                  + +
                  +
                  + + + + + + + + + + + +
                  void gen_8bit_random (uint8_t * d_random_number,
                  int test_size )
                  +
                  + +
                  +
                  + +

                  ◆ gen_data()

                  + +
                  +
                  + + + + + + + + + + + +
                  void gen_data (float * d_f32_array,
                  int test_size )
                  +
                  + +
                  +
                  + +

                  ◆ main()

                  + +
                  +
                  + + + + + + + + + + + +
                  int main (int argc,
                  char * argv[] )
                  +
                  + +
                  +
                  + +

                  ◆ two_to_e()

                  + +
                  +
                  + + + + + + + +
                  __device__ float two_to_e (float X)
                  +
                  + +
                  +
                  +
                  + + + +